//! This module contains the implementation of the `NsedAgent`.
use crate::agents::ChatCapable;
use crate::agents::DeliberationPhase;
use crate::agents::normalize_score;
use crate::agents::{
AgentConfig, AgentContext, CategoryScores, ClaimAssessment, DisagreementPoint, Evaluation,
NsedAgent, Proposal, Stance, TokenUsage,
};
use crate::emit_for;
use crate::llms::LlmRequestSpan;
use crate::llms::{AiModel, RequestConfig};
use crate::prompts::PromptSet;
use crate::telemetry::RetryReason;
use crate::tools::Tool;
use crate::tools::context::{
ReadCritiquesTool, ReadOwnProposalTool, ReadProposalTool, SearchDeliberationTool,
};
use crate::tools::user_call::UserCallTool;
use anyhow::{Context, Result};
use async_openai::types::{
ChatCompletionRequestAssistantMessage, ChatCompletionRequestMessage,
ChatCompletionRequestSystemMessage, ChatCompletionRequestToolMessage,
ChatCompletionRequestToolMessageContent, ChatCompletionRequestUserMessage, ChatCompletionTool,
ChatCompletionToolType, FunctionObject,
};
use async_trait::async_trait;
use llm_repair::{
clean_json_string, extract_evaluations_from_markdown, extract_proposal_from_markdown,
extract_python_tool_calls, extract_xml_tool_calls, repair_aggressive_escapes,
repair_conversational_response, repair_invalid_escapes, repair_tool_calls,
repair_truncated_json, sanitize_json_string_lossy,
};
use serde::Deserialize;
use serde::de::DeserializeOwned;
use serde_json::{Value, json};
use std::collections::HashMap;
// use std::sync::Arc; // Unused
use tracing::{debug, info, instrument, warn};
/// Per-call cap on tool output, expressed as a fraction of the agent's
/// remaining context budget. With `0.10`, `apply_tool_output_cap` clips a
/// tool result to 10% of `(context_window − estimated_input_tokens)`,
/// converted from tokens to bytes via [`CHARS_PER_TOKEN`].
const TOOL_OUTPUT_FRACTION: f32 = 0.10;
/// Characters-per-token rule of thumb. Conservative; matches the
/// `chars_per_token` ratio used elsewhere in the SDK. Used to map
/// the token-budget cap onto a byte cap on the raw `tool_output`
/// (the cap is compared against `String::len`, i.e. bytes, so multi-byte
/// text is clipped somewhat earlier than the nominal character count).
const CHARS_PER_TOKEN: f32 = 4.0;
/// Decide whether the M3 auto-compaction should fire for the current
/// message history.
///
/// Returns `true` only when all of the following hold:
/// * `context_window` is positive (a non-positive window means the
///   provider doesn't expose it, which disables the trigger);
/// * `message_count` is greater than 4 (≈ system + user + ≥1 assistant +
///   ≥1 tool, i.e. there is something worth folding);
/// * the serialized history, estimated as `messages_chars / chars_per_token`
///   tokens, occupies at least 90% of `context_window`.
///
/// A `chars_per_token` that is not strictly positive (including NaN) falls
/// back to 4 characters per token.
fn should_auto_compact(
    messages_chars: usize,
    context_window: i32,
    message_count: usize,
    chars_per_token: f32,
) -> bool {
    let window_known = context_window > 0;
    let has_foldable_history = message_count > 4;
    if !(window_known && has_foldable_history) {
        return false;
    }

    // Guard against a zero/negative/NaN ratio before dividing by it.
    let ratio = match chars_per_token {
        r if r > 0.0 => r,
        _ => 4.0,
    };

    let token_estimate = (messages_chars as f32 / ratio) as i32;
    let used_pct = (token_estimate as f64 / context_window as f64) * 100.0;
    used_pct >= 90.0
}
/// Outcome of a [`compact_message_history`] call.
///
/// When nothing was eligible for folding, `new_messages` is a copy of the
/// input history, `summary` is empty and `compacted_count` is `0`.
pub struct CompactionResult {
/// New message history with older tool calls folded into a
/// synthetic `compact_history` tool-result pair.
pub new_messages: Vec<ChatCompletionRequestMessage>,
/// LLM-generated summary text to append to the scratchpad.
pub summary: String,
/// Number of tool-call results that were folded (tool-result messages
/// located before the cut point, not `total - keep_last_n_calls`).
pub compacted_count: usize,
}
/// Build a one-shot LLM request that summarises the older portion
/// of the conversation. Returns the new history (preamble + a
/// single synthetic compaction pair + the most-recent N tool
/// calls) and the summary text.
pub async fn compact_message_history(
llm_client: &dyn crate::llms::AiModel,
agent_config: &AgentConfig,
messages: &[ChatCompletionRequestMessage],
keep_last_n_calls: usize,
) -> anyhow::Result<CompactionResult> {
use crate::llms::RequestConfig;
use async_openai::types::ChatCompletionMessageToolCall;
use async_openai::types::FunctionCall;
// Treat keep=0 as keep=1: keeping zero recent tool results would
// index `tool_msg_indices[len()]` and panic.
let keep_last_n_calls = keep_last_n_calls.max(1);
// Under `disable_native_tools` tool outputs are rewritten into
// User messages prefixed with `Tool Output (` — fold those too.
let is_tool_boundary = |m: &ChatCompletionRequestMessage| -> bool {
match m {
ChatCompletionRequestMessage::Tool(_) => true,
ChatCompletionRequestMessage::User(u) => {
if let async_openai::types::ChatCompletionRequestUserMessageContent::Text(t) =
&u.content
{
t.starts_with("Tool Output (")
} else {
false
}
}
_ => false,
}
};
let tool_msg_indices: Vec<usize> = messages
.iter()
.enumerate()
.filter(|(_, m)| is_tool_boundary(m))
.map(|(i, _)| i)
.collect();
if tool_msg_indices.len() <= keep_last_n_calls {
return Ok(CompactionResult {
new_messages: messages.to_vec(),
summary: String::new(),
compacted_count: 0,
});
}
// Walk back to the parent Assistant turn so the kept tool_result
// doesn't reference tool_calls that got folded into the summary —
// the provider rejects orphaned tool messages.
let raw_tool_cut = tool_msg_indices[tool_msg_indices.len() - keep_last_n_calls];
let cut_idx = (0..raw_tool_cut)
.rev()
.find(|&i| matches!(messages[i], ChatCompletionRequestMessage::Assistant(_)))
.unwrap_or(raw_tool_cut);
// Anchor the preamble at the assistant turn, not the tool turn,
// so we don't leave an orphan `tool_calls` whose result got folded.
let first_foldable_tool_idx = messages
.iter()
.take(cut_idx)
.position(is_tool_boundary)
.unwrap_or(cut_idx);
let preamble_end = (0..first_foldable_tool_idx)
.rev()
.find(|&i| matches!(messages[i], ChatCompletionRequestMessage::Assistant(_)))
.unwrap_or(first_foldable_tool_idx);
let to_summarize = &messages[preamble_end..cut_idx];
let mut to_summarize_text = String::new();
for m in to_summarize {
match m {
ChatCompletionRequestMessage::Tool(t) => {
if let ChatCompletionRequestToolMessageContent::Text(s) = &t.content {
to_summarize_text.push_str("[tool_result] ");
to_summarize_text.push_str(s);
to_summarize_text.push_str("\n\n");
}
}
ChatCompletionRequestMessage::Assistant(a) => {
if let Some(tcs) = &a.tool_calls {
for tc in tcs {
to_summarize_text.push_str(&format!(
"[tool_call] {}({})\n",
tc.function.name, tc.function.arguments
));
}
}
}
ChatCompletionRequestMessage::User(u) => {
// `disable_native_tools` mode: tool outputs land here
// as text-rewritten User messages. Without this branch
// the summariser sees zero tool evidence and the
// compaction silently strips the actual data.
if let async_openai::types::ChatCompletionRequestUserMessageContent::Text(t) =
&u.content
&& t.starts_with("Tool Output (")
{
to_summarize_text.push_str("[tool_result] ");
to_summarize_text.push_str(t);
to_summarize_text.push_str("\n\n");
}
}
_ => {}
}
}
let summarize_prompt = format!(
"You are compressing an agent's accumulated tool-call history \
so the agent can keep reasoning under tighter context pressure. \
The summary will REPLACE the calls below in the conversation, \
so anything you drop is gone.\n\n\
Produce a structured summary using this exact section order. \
Every section starts with an imperative — do what it says.\n\n\
1. User Request and Primary Intent — Capture all explicit asks, \
constraints, and the underlying goal. Look for: original request \
phrasing, mid-task clarifications, success criteria, anything \
the user explicitly forbade.\n\
2. Methodology & Technical Concepts — List all approaches, \
algorithms, APIs, formats, protocols, and libraries in play. \
Look for: framework names and versions, data formats, patterns \
being followed (TDD, RAG, …), and the rationale for choosing them.\n\
3. References and Quotes — Capture all file paths read with \
line ranges, verbatim quotes the agent has cited (≤ 1 line each), \
and peer/external outputs referenced. Look for: file path + \
line range for every read, commit SHAs, URLs, identifier names. \
Drop full file dumps.\n\
4. Errors, Fixes and Learnings — List all errors encountered, \
the fix applied (or that it remains open), and the generalisable \
lesson. Look for: exact error message, root cause, the change \
that resolved it, what to avoid next time.\n\
5. User Messages — Capture all explicit instructions, corrections, \
and constraints from the user/orchestrator turn-by-turn. Look for: \
verbatim phrasing that disambiguates intent, `stop`/`don't`/\
`instead` pivots, deadlines, scope cuts.\n\
6. Pending Work — List all open tool calls, unfinished sub-goals, \
and known follow-ups. Look for: what is queued, why it hasn't \
been done yet, dependencies on other steps.\n\
7. Current Work — Capture the agent's most recent action and what \
it was about to do next. Look for: the in-flight tool call, \
the file/function being inspected, the immediate next move.\n\n\
=== HISTORY TO SUMMARISE ===\n{to_summarize_text}"
);
let request_config = RequestConfig {
messages: vec![
ChatCompletionRequestUserMessage {
content: summarize_prompt.into(),
..Default::default()
}
.into(),
],
tools: None,
tool_choice: None,
presence_penalty: None,
};
let result = llm_client
.chat_completion(agent_config, request_config)
.await
.map_err(|e| anyhow::anyhow!("compact_history summariser call failed: {e}"))?;
let summary = result
.response
.choices
.first()
.and_then(|c| c.message.content.clone())
.filter(|s| !s.trim().is_empty())
.ok_or_else(|| anyhow::anyhow!("compact_history summariser returned empty content"))?;
// `total - keep_last_n_calls` lies when an assistant batch emits
// multiple tool calls; count what's actually before `cut_idx`.
let actually_compacted = tool_msg_indices
.iter()
.take_while(|&&i| i < cut_idx)
.count();
// When `disable_native_tools` is on, the rest of the loop flattens
// tool traffic into User-message text; injecting native tool_calls
// + Tool roles here would land in a request that sets `tools: None`
// and the provider rejects the protocol mismatch. Mirror the
// existing `Tool Output (...)` rewrite convention so subsequent
// is_tool_boundary scans still recognise the pair.
let (synth_assistant, synth_tool_result): (
ChatCompletionRequestMessage,
ChatCompletionRequestMessage,
) = if agent_config.disable_native_tools {
let assistant: ChatCompletionRequestMessage = ChatCompletionRequestAssistantMessage {
content: Some(
async_openai::types::ChatCompletionRequestAssistantMessageContent::Text(format!(
"[compact_history(keep_last_n_calls={keep_last_n_calls})]"
)),
),
tool_calls: None,
..Default::default()
}
.into();
let result: ChatCompletionRequestMessage = ChatCompletionRequestUserMessage {
content: format!(
"Tool Output (compact_history): [Compacted {actually_compacted} earlier \
tool calls into scratchpad. Summary:]\n{summary}"
)
.into(),
..Default::default()
}
.into();
(assistant, result)
} else {
let compact_tool_call_id = format!("compact_history_{}", uuid::Uuid::new_v4().simple());
let assistant: ChatCompletionRequestMessage = ChatCompletionRequestAssistantMessage {
tool_calls: Some(vec![ChatCompletionMessageToolCall {
id: compact_tool_call_id.clone(),
r#type: ChatCompletionToolType::Function,
function: FunctionCall {
name: "compact_history".into(),
arguments: json!({ "keep_last_n_calls": keep_last_n_calls }).to_string(),
},
}]),
..Default::default()
}
.into();
let result: ChatCompletionRequestMessage = ChatCompletionRequestToolMessage {
tool_call_id: compact_tool_call_id,
content: ChatCompletionRequestToolMessageContent::Text(format!(
"[Compacted {actually_compacted} earlier tool calls into scratchpad. \
Summary:]\n{summary}"
)),
}
.into();
(assistant, result)
};
let mut new_messages = messages[..preamble_end].to_vec();
new_messages.push(synth_assistant);
new_messages.push(synth_tool_result);
new_messages.extend(messages[cut_idx..].iter().cloned());
Ok(CompactionResult {
new_messages,
summary,
compacted_count: actually_compacted,
})
}
/// Compress an over-full scratchpad with the agent's own LLM.
///
/// The squeeze only runs when `scratchpad.len() / max_scratchpad_size`
/// has reached `agent_config.scratchpad_squeeze_fraction`. The first three
/// quarters of the text (cut on a UTF-8 char boundary) are sent for
/// compression; the trailing quarter is re-attached verbatim so the
/// most-recent context stays intact.
///
/// # Returns
/// * `Ok(None)` — nothing to do (empty scratchpad, `max_scratchpad_size == 0`,
///   below threshold) or the squeezed text was not shorter than the original
///   or still exceeded `max_scratchpad_size`.
/// * `Ok(Some(text))` — the compressed older section followed by the
///   untouched recent section.
///
/// # Errors
/// Fails when the summariser call errors or returns empty content.
pub async fn squeeze_scratchpad_if_full(
    llm_client: &dyn crate::llms::AiModel,
    agent_config: &AgentConfig,
    scratchpad: &str,
    max_scratchpad_size: usize,
) -> anyhow::Result<Option<String>> {
    use crate::llms::RequestConfig;

    let total_len = scratchpad.len();
    if max_scratchpad_size == 0 || total_len == 0 {
        return Ok(None);
    }
    let ratio = total_len as f64 / max_scratchpad_size as f64;
    if ratio < agent_config.scratchpad_squeeze_fraction {
        return Ok(None);
    }

    // Back off from the 75% mark until we sit on a char boundary; index 0
    // always is one, so the loop terminates.
    let mut split_point = total_len * 3 / 4;
    while !scratchpad.is_char_boundary(split_point) {
        split_point -= 1;
    }
    let (older, recent) = scratchpad.split_at(split_point);

    let prompt = format!(
        "The agent's scratchpad is at {pct}% of its capacity \
        ({cur}/{max} chars). Compress the OLDER section using the \
        numbered template below. The RECENT section will be appended \
        verbatim — DO NOT touch it.\n\n\
        The scratchpad is the agent's persistent reasoning notebook \
        across rounds of a multi-agent deliberation. Every section \
        starts with an imperative — do what it says.\n\n\
        1. Primary Request and Intent — Capture the deliberation task \
        verbatim plus the success criterion the orchestrator set. Look \
        for: the framing question, scoring rubric, hard constraints.\n\
        2. Key Findings and Stances — List all conclusions this agent \
        has reached and the position it is defending. Look for: claim \
        + supporting evidence + confidence level.\n\
        3. Rounds and Phases — For each round/phase already assessed, \
        capture a one-line conclusion. Look for: round id, phase \
        (Propose/Evaluate/Refine), what changed since the previous \
        round.\n\
        4. Key Team Disagreements — List all points where peer agents \
        diverged and the evidence each side cited. Look for: peer \
        name, claim, counter-claim, evidence anchor.\n\
        5. Decisions Made — Capture every commitment the agent has \
        made and no longer plans to revisit. Look for: decision + \
        reason + when made.\n\
        6. References and Quotes — Capture all file paths read with \
        line ranges, verbatim quotes (≤ 1 line each), and peer outputs \
        referenced. Look for: path + line range, commit SHAs, URLs, \
        identifier names. Drop full dumps.\n\
        7. Errors, Fixes and Learnings — List all errors, the fix, and \
        the generalisable lesson. Look for: error message, root cause, \
        the change that resolved it.\n\
        8. Current Focus — Capture what the agent is reasoning about \
        right now and what would change its mind. Look for: the open \
        question, the falsifier, the next piece of evidence sought.\n\
        9. Optional Next Step — Capture the single most useful next \
        move, or `None`. Look for: a concrete tool call or peer reply \
        the agent should issue immediately.\n\n\
        Drop boilerplate, process narration, and anything already \
        implied by the RECENT section.\n\n\
        === OLDER (compress this) ===\n{older}\n\n\
        === RECENT (do not touch) ===\n{recent}",
        pct = (ratio * 100.0).round() as u32,
        cur = total_len,
        max = max_scratchpad_size,
    );

    // Single user turn, no tools: this is a plain summarisation call.
    let user_turn = ChatCompletionRequestUserMessage {
        content: prompt.into(),
        ..Default::default()
    };
    let request_config = RequestConfig {
        messages: vec![user_turn.into()],
        tools: None,
        tool_choice: None,
        presence_penalty: None,
    };

    let reply = match llm_client
        .chat_completion(agent_config, request_config)
        .await
    {
        Ok(reply) => reply,
        Err(e) => {
            return Err(anyhow::anyhow!(
                "scratchpad squeeze summariser call failed: {e}"
            ));
        }
    };

    let first_choice_text = reply
        .response
        .choices
        .first()
        .and_then(|choice| choice.message.content.clone());
    let compressed = match first_choice_text {
        Some(text) if !text.trim().is_empty() => text,
        _ => {
            return Err(anyhow::anyhow!(
                "scratchpad squeeze summariser returned empty content"
            ));
        }
    };

    // Accept the squeeze only if it is strictly shorter AND fits the budget.
    let candidate = format!("{compressed}\n\n{recent}");
    let is_shorter = candidate.len() < total_len;
    let fits_budget = candidate.len() <= max_scratchpad_size;
    if is_shorter && fits_budget {
        return Ok(Some(candidate));
    }
    warn!(
        old_len = scratchpad.len(),
        new_len = candidate.len(),
        max = max_scratchpad_size,
        "scratchpad squeeze produced no improvement; keeping original"
    );
    Ok(None)
}
/// Clip `tool_output` in place to the per-call context-budget cap.
///
/// The cap is `TOOL_OUTPUT_FRACTION` of the remaining budget
/// `(context_window − estimated_input_tokens)`, converted to bytes with
/// `CHARS_PER_TOKEN`. When the output exceeds it, the text is cut on a
/// UTF-8 char boundary and a `[truncated: …]` marker is appended so the
/// model can see the result was clipped. With a zero cap the whole result
/// is replaced by a marker instead.
///
/// Returns `true` when the cap engaged, `false` when the output was left
/// untouched. `context_window <= 0` (provider doesn't expose it) keeps the
/// legacy uncapped behavior; the SDK's downstream shrink-guard is the only
/// protection in that case.
fn apply_tool_output_cap(
    tool_output: &mut String,
    context_window: i32,
    estimated_input_tokens: u32,
    tool_name: &str,
) -> bool {
    if context_window <= 0 {
        return false;
    }

    let budget_tokens = (context_window as i64 - estimated_input_tokens as i64).max(0) as f32;
    let cap_bytes = (budget_tokens * CHARS_PER_TOKEN * TOOL_OUTPUT_FRACTION) as usize;
    let original_len = tool_output.len();
    if original_len <= cap_bytes {
        return false;
    }

    if cap_bytes == 0 {
        // Zero remaining context budget — replace the result with a
        // marker. Anything else lets the SDK shrink-guard fire on the
        // next call with a 200-token output budget, breaking downstream
        // JSON.
        tool_output.clear();
        tool_output.push_str(
            "[truncated: no remaining context budget; \
            re-call with offset/regex/page to scope the read]",
        );
        warn!(
            tool_name = %tool_name,
            original_len,
            cap_bytes = 0,
            "tool result fully replaced with marker; context exhausted"
        );
    } else {
        // Step back to the nearest char boundary at or below the cap;
        // index 0 is always a boundary, so this terminates.
        let mut safe_cap = cap_bytes;
        while !tool_output.is_char_boundary(safe_cap) {
            safe_cap -= 1;
        }
        tool_output.truncate(safe_cap);
        let marker = format!(
            "\n[truncated: tool result clipped at {} bytes \
            ({:.0}% of remaining context); original was {} bytes — \
            re-call with offset/regex/page to scope the read]",
            safe_cap,
            TOOL_OUTPUT_FRACTION * 100.0,
            original_len
        );
        tool_output.push_str(&marker);
        warn!(
            tool_name = %tool_name,
            original_len,
            cap_bytes = safe_cap,
            "tool result truncated to per-call context-budget cap"
        );
    }
    true
}
/// Write a structured failure dump to disk. Opt-in: enabled when the mode is
/// `1`, `on` or `full`, taken from `params.failure_dumps_config` when it is
/// set, otherwise from the `NSED_FAILURE_DUMPS` environment variable.
///
/// Dumps are organized as **one directory per job** under `failures/`:
///
/// ```text
/// failures/
/// a1b2c3d4_DEFAULT/ # session_id prefix + agent name
/// parse_error_r2.md # all parse retries for round 2 appended here
/// api_error_r3.md # API failure in round 3
/// e5f6g7h8_ARCHIT/
/// parse_error_r1.md
/// ```
///
/// Subsequent failures for the same job+kind+round are **appended** rather
/// than creating new files, so you can follow the retry evolution in one doc.
/// Full context (system prompt, request body, messages) is only written on the
/// first entry — retries share the same prompt, so repeating it wastes space.
///
/// After a successful write, surplus job directories are pruned down to
/// `NSED_FAILURE_DUMPS_MAX` (default 20).
///
/// Returns `Some(path)` on success so callers can log the filename, and
/// `None` when dumps are disabled or the write failed (the I/O error is
/// logged at `warn` level, never propagated).
fn write_failure_dump(params: FailureDumpParams<'_>) -> Option<String> {
// Config value takes precedence over env var. When the config explicitly
// says "off", we must NOT fall through to the env var.
let dump_mode = match params.failure_dumps_config {
Some(v) => {
// Config values are normalised (trimmed + lowercased); the env var
// value below is used as-is.
let v = v.trim().to_lowercase();
if v == "off" || v.is_empty() {
None // Config explicitly disabled — short-circuit, skip env var
} else {
Some(v)
}
}
None => std::env::var("NSED_FAILURE_DUMPS").ok(),
};
let dump_mode = dump_mode?;
// Only enable dumps for explicit opt-in values: "1", "on", or "full".
// Any other value (including typos) disables dumping.
let is_full = dump_mode.eq_ignore_ascii_case("full");
if dump_mode != "1" && !dump_mode.eq_ignore_ascii_case("on") && !is_full {
return None;
}
// Seconds since the Unix epoch; falls back to 0 if the clock is before it.
let timestamp = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
// Replace path separators, spaces and parentheses so the agent name is
// usable as part of a directory name.
let safe_name = params.agent_name.replace(['/', '\\', ' ', '(', ')'], "_");
// ── Job directory: one per session (or fallback key) ─────────────
let job_dir_name = if let Some(sid) = params.session_id {
let safe_session = sid.replace(['/', '\\', ' ', '(', ')', '.', ':'], "_");
// Only the first 8 characters of the sanitized session id are used.
let short: String = safe_session.chars().take(8).collect();
format!("{short}_{safe_name}")
} else {
// No session_id: use a timestamp-based directory. The timestamp has
// one-second resolution, so only failures logged within the same
// second share a directory.
format!("{timestamp}_{safe_name}")
};
let job_dir = format!("failures/{job_dir_name}");
// File within the job directory: one per error kind + round
// (a missing round is recorded as round 0).
let round = params.round.unwrap_or(0);
let filename = format!("{job_dir}/{kind}_r{round}.md", kind = params.kind);
// ── Determine if this is a new file or an append ─────────────────
let is_append = std::path::Path::new(&filename).exists();
let mut out = String::with_capacity(4096);
if is_append {
// Horizontal rule separating this entry from the previous one.
out.push_str("\n---\n\n");
}
// ── Section header ───────────────────────────────────────────────
let attempt_label = params
.attempt
.map(|a| format!(" (attempt {a})"))
.unwrap_or_default();
out.push_str(&format!(
"# {kind}{attempt_label} — {timestamp}\n\n",
kind = params.kind,
));
// ── Metadata table ───────────────────────────────────────────────
// Optional fields only get a row when they are present.
out.push_str("| field | value |\n|---|---|\n");
out.push_str(&format!("| agent | {} |\n", params.agent_name));
out.push_str(&format!("| model | {} |\n", params.model_name));
out.push_str(&format!("| provider | {} |\n", params.provider_id));
if let Some(sid) = params.session_id {
out.push_str(&format!("| session_id | {sid} |\n"));
}
if let Some(phase) = params.phase {
out.push_str(&format!("| phase | {phase} |\n"));
}
out.push_str(&format!("| round | {round} |\n"));
if let Some(attempt) = params.attempt {
out.push_str(&format!("| attempt | {attempt} |\n"));
}
if let Some(fr) = params.finish_reason {
out.push_str(&format!("| finish_reason | {fr} |\n"));
}
if let Some(t) = params.input_tokens {
out.push_str(&format!("| input_tokens | {t} |\n"));
}
if let Some(t) = params.output_tokens {
out.push_str(&format!("| output_tokens | {t} |\n"));
}
out.push('\n');
// ── Error ────────────────────────────────────────────────────────
out.push_str("## Error\n\n```\n");
out.push_str(params.error);
out.push_str("\n```\n\n");
// ── LLM response (parse-error only) ──────────────────────────────
// Capped at 4000 characters.
if let Some(response) = params.response_content {
let truncated = truncate_for_dump(response, 4000);
out.push_str("## LLM Response\n\n```\n");
out.push_str(&truncated);
out.push_str("\n```\n\n");
}
// ── Full context (opt-in, first entry only) ──────────────────────
// System prompt and messages are identical across retries — only
// include them on the first write to save space + context.
if is_full && !is_append {
// The system prompt is written in full; request body and messages
// are capped at 8000 and 12000 characters respectively.
if let Some(sys) = params.system_prompt {
out.push_str("## System Prompt\n\n```\n");
out.push_str(sys);
out.push_str("\n```\n\n");
}
if let Some(body) = params.request_body {
let truncated = truncate_for_dump(body, 8000);
out.push_str("## Request Body\n\n```json\n");
out.push_str(&truncated);
out.push_str("\n```\n\n");
}
if let Some(msgs) = params.messages {
// Pretty JSON when serialisable, otherwise the Debug rendering.
let msgs_json =
serde_json::to_string_pretty(msgs).unwrap_or_else(|_| format!("{msgs:#?}"));
let truncated = truncate_for_dump(&msgs_json, 12000);
out.push_str("## Messages\n\n```json\n");
out.push_str(&truncated);
out.push_str("\n```\n\n");
}
} else if !is_full && !is_append {
// Non-full mode: leave a one-time hint about how to get the full context.
out.push_str(
"_Set `NSED_FAILURE_DUMPS=full` to include system prompt, request body, and messages._\n",
);
}
// ── Write / append to disk ───────────────────────────────────────
let write_result = std::fs::create_dir_all(&job_dir).and_then(|_| {
use std::io::Write;
let mut file = std::fs::OpenOptions::new()
.create(true)
.append(true)
.open(&filename)?;
file.write_all(out.as_bytes())
});
if let Err(io_err) = write_result {
// Best-effort: a failed dump must never fail the caller.
warn!("Failed to write failure dump: {}", io_err);
return None;
}
// ── Prune old job directories (NSED_FAILURE_DUMPS_MAX, default 20)
// An unset or unparsable value falls back to 20.
let max_dirs: usize = std::env::var("NSED_FAILURE_DUMPS_MAX")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(20);
prune_failure_dirs("failures", max_dirs);
Some(filename)
}
/// Trim `parent` down to at most `max_dirs` sub-directories.
///
/// Only directories are considered (plain files are left alone). When there
/// are more than `max_dirs` of them, they are ordered by filesystem
/// modified-time and the oldest surplus is deleted recursively.
///
/// Best-effort: an unreadable `parent`, unreadable entries, missing metadata
/// and failed deletions are all silently ignored.
fn prune_failure_dirs(parent: &str, max_dirs: usize) {
    let Ok(listing) = std::fs::read_dir(parent) else {
        return;
    };

    // Keep only the entries that are readable and are directories.
    let mut job_dirs = Vec::new();
    for entry in listing.flatten() {
        if matches!(entry.file_type(), Ok(kind) if kind.is_dir()) {
            job_dirs.push(entry);
        }
    }
    if job_dirs.len() <= max_dirs {
        return;
    }

    // Pair each directory with its mtime; entries without one are skipped.
    let mut dated = Vec::with_capacity(job_dirs.len());
    for entry in job_dirs {
        let Ok(meta) = entry.metadata() else {
            continue;
        };
        let Ok(modified) = meta.modified() else {
            continue;
        };
        dated.push((modified, entry.path()));
    }
    dated.sort_by_key(|(modified, _)| *modified);

    // Oldest first, so the surplus sits at the front.
    let surplus = dated.len().saturating_sub(max_dirs);
    for (_, stale_path) in dated.drain(..surplus) {
        let _ = std::fs::remove_dir_all(stale_path);
    }
}
/// Parameters for [`write_failure_dump`].
///
/// All fields are borrowed; optional fields that are `None` are simply
/// omitted from the dump.
struct FailureDumpParams<'a> {
/// Error kind; used in the file name (`{kind}_r{round}.md`) and the entry header.
kind: &'a str,
/// Agent name; sanitized to build the job directory name and listed in the metadata table.
agent_name: &'a str,
/// Model name, listed in the metadata table.
model_name: &'a str,
/// Provider id, listed in the metadata table.
provider_id: &'a str,
/// Error text, written verbatim into the `## Error` section.
error: &'a str,
/// Session id; its first 8 sanitized characters prefix the job directory.
/// When `None`, a timestamp-based directory name is used instead.
session_id: Option<&'a str>,
/// Phase label, listed in the metadata table when present.
phase: Option<&'a str>,
/// Round number; `None` is recorded as round 0.
round: Option<u32>,
/// Retry attempt number, shown in the entry header and metadata table.
attempt: Option<usize>,
/// Finish reason, listed in the metadata table when present.
finish_reason: Option<&'a str>,
/// Input token count, listed in the metadata table when present.
input_tokens: Option<u32>,
/// Output token count, listed in the metadata table when present.
output_tokens: Option<u32>,
/// Raw LLM response; written (truncated to 4000 chars) as `## LLM Response`.
response_content: Option<&'a str>,
/// System prompt; only written in `full` mode on the first entry of a file.
system_prompt: Option<&'a str>,
/// Request body; only written in `full` mode on the first entry (truncated to 8000 chars).
request_body: Option<&'a str>,
/// Message history; only written in `full` mode on the first entry (truncated to 12000 chars).
messages: Option<&'a [ChatCompletionRequestMessage]>,
/// From `AgentConfig.failure_dumps` — takes precedence over env var.
failure_dumps_config: Option<&'a str>,
}
/// Limit `s` to at most `max_chars` characters for inclusion in a dump file.
///
/// Counting is done in `char`s, so the cut always lands on a char boundary.
/// Strings that already fit are returned unchanged; longer ones are cut and
/// suffixed with a `... (truncated at N chars)` line.
fn truncate_for_dump(s: &str, max_chars: usize) -> String {
    // The byte offset of the (max_chars + 1)-th char, if there is one, is
    // exactly where the kept prefix ends.
    match s.char_indices().nth(max_chars) {
        None => s.to_owned(),
        Some((cut_at, _)) => {
            let truncated = &s[..cut_at];
            format!("{truncated}\n... (truncated at {max_chars} chars)")
        }
    }
}
/// Result bundle for a single agent LLM interaction, defined locally in this
/// module.
///
/// NOTE(review): the producer of this struct is outside the visible part of
/// the file; the field notes below are inferred from names and types and
/// should be confirmed against it.
#[derive(Debug, Clone)]
pub struct AgentResponse {
/// Response text (presumably the final assistant content — confirm).
pub content: String,
/// Per-tool counters keyed by tool name (presumably number of invocations — confirm).
pub tool_usage: HashMap<String, usize>,
/// Finish reason as a string, when known.
pub finish_reason: Option<String>,
/// Input token count, when known.
pub input_tokens: Option<u32>,
/// Output token count, when known.
pub output_tokens: Option<u32>,
/// System prompt associated with the request, when captured.
pub system_prompt: Option<String>,
/// Request body associated with the request, when captured.
pub request_body: Option<String>,
/// Chat message history associated with this response.
pub history: Vec<ChatCompletionRequestMessage>,
/// Scratchpad text at the end of the interaction, if any.
pub final_scratchpad: Option<String>,
}
/// One candidate's entry inside a structured batch-evaluation response.
///
/// Deserialization is deliberately lenient: every field except
/// `endorsement_weight` falls back to its default when absent, and two
/// fields accept alternative key names.
#[derive(Debug, Deserialize, serde::Serialize)]
#[allow(dead_code)]
struct BatchEvaluationItem {
/// Evaluated agent; also accepted under the key `candidate_id`.
/// Defaults to an empty string when missing.
#[serde(alias = "candidate_id", default)]
agent_id: String,
/// Endorsement weight; also accepted under the key `score`.
/// This is the only field without a default, so it must be present.
#[serde(alias = "score")]
endorsement_weight: f32,
/// Optional free-text justification.
#[serde(default)]
justification: Option<String>,
/// Final-solution flag; `false` when missing.
#[serde(default)]
is_final_solution: bool,
/// Optional stance.
#[serde(default)]
stance: Option<Stance>,
/// Per-claim assessments; empty when missing.
#[serde(default)]
claim_assessments: Vec<ClaimAssessment>,
/// Points of disagreement; empty when missing.
#[serde(default)]
disagreements: Vec<DisagreementPoint>,
/// Optional per-category scores.
#[serde(default)]
category_scores: Option<CategoryScores>,
}
/// Top-level shape of a structured batch-evaluation response: a list of
/// [`BatchEvaluationItem`]s under a single (aliased) key.
#[derive(Debug, Deserialize, serde::Serialize)]
struct StructuredBatchEvaluationResponse {
/// Models sometimes use "candidate_evaluations" or "candidates" instead of "evaluations".
/// All three keys are accepted on input.
#[serde(alias = "candidate_evaluations", alias = "candidates")]
evaluations: Vec<BatchEvaluationItem>,
}
/// Top-level shape of a structured proposal response.
///
/// Both fields go through [`deserialize_string_or_array_or_object`], so a
/// model may send a string, an array, or any other JSON value and still
/// deserialize into a `String`. Both fields are required.
#[derive(Debug, Deserialize, serde::Serialize)]
struct StructuredProposalResponse {
/// The `thought_process` field, coerced to a single string.
#[serde(deserialize_with = "deserialize_string_or_array_or_object")]
thought_process: String,
/// The `solution_content` field, coerced to a single string.
#[serde(deserialize_with = "deserialize_string_or_array_or_object")]
solution_content: String,
}
/// Lenient serde deserializer that coerces any JSON value into a `String`.
///
/// * a JSON string is returned as-is;
/// * an array is flattened to one line per element, joined with `"\n"` —
///   string elements verbatim, any other element as compact JSON;
/// * every other value (object, number, bool, null) is rendered as compact
///   JSON text.
///
/// Models like Mistral frequently return
/// `"thought_process": ["Step 1: ...", "Step 2: ..."]` instead of a single
/// string, or nest the entire answer inside `"solution_content": { ... }`.
fn deserialize_string_or_array_or_object<'de, D>(deserializer: D) -> Result<String, D::Error>
where
    D: serde::Deserializer<'de>,
{
    use serde_json::Value;

    // Strings are taken verbatim (no surrounding quotes); anything else is
    // serialized to compact JSON.
    fn render(value: Value) -> String {
        match value {
            Value::String(text) => text,
            non_string => non_string.to_string(),
        }
    }

    let parsed = Value::deserialize(deserializer)?;
    let text = match parsed {
        Value::Array(items) => {
            let lines: Vec<String> = items.into_iter().map(render).collect();
            lines.join("\n")
        }
        scalar_or_object => render(scalar_or_object),
    };
    Ok(text)
}
/// Normalize an `agent_response.finish_reason` string to the
/// lowercase wire values documented on `Proposal::finish_reason`
/// (`stop`, `tool_calls`, `length`, ...) before export to DTOs /
/// downstream consumers. The internal producer uses
/// `format!("{r:?}")` on the OpenAI enum, which yields Debug form
/// (`Stop`, `ToolCalls`, `Length`) — fine for intra-module flow
/// checks, wrong for the public contract. Unknown values fall
/// through lowercased so callers see something stable instead of
/// nothing. The synthetic `"max_iterations"` sentinel already
/// matches the wire format and passes through untouched.
fn normalize_finish_reason(raw: &str) -> String {
    match raw {
        "Stop" => "stop".to_string(),
        "ToolCalls" => "tool_calls".to_string(),
        "Length" => "length".to_string(),
        "ContentFilter" => "content_filter".to_string(),
        "FunctionCall" => "function_call".to_string(),
        other => other.to_ascii_lowercase(),
    }
}

/// Strips thinking-token prefixes leaked by reasoning models (e.g. gpt-oss-120b).
/// Common patterns:
///   "analysisWe need to...assistantfinal**Critique..." → "**Critique..."
///   "final**Asset-class allocation..." → "**Asset-class allocation..."
///   "commentaryto=functions.submit_proposal json{...}" → "{...}"
///
/// The `assistant` role marker is honoured at its first occurrence that is
/// directly followed by `final`, `{`, `**` or `#`. Earlier mentions of the
/// word inside the leaked reasoning (e.g. "the assistant should …") are
/// skipped rather than aborting the strip.
///
/// Returns a sub-slice of `content` with surrounding whitespace trimmed;
/// input without any recognised prefix is only trimmed.
fn strip_thinking_prefix(content: &str) -> &str {
    const ROLE_MARKER: &str = "assistant";

    // Trim leading whitespace so prefixes are detected even when LLMs
    // emit leading spaces/newlines.
    let trimmed = content.trim_start();

    // Strip "analysis...assistant" prefix (model internal tokens leaked into
    // output). Only strip if what follows looks like content (not mid-word).
    let after_role_marker = trimmed
        .match_indices(ROLE_MARKER)
        .map(|(pos, _)| &trimmed[pos + ROLE_MARKER.len()..])
        .find(|after| {
            after.starts_with("final")
                || after.starts_with('{')
                || after.starts_with("**")
                || after.starts_with('#')
        });
    let mut s = after_role_marker.unwrap_or(trimmed);

    // Strip "final" prefix
    if let Some(rest) = s.strip_prefix("final") {
        s = rest;
    }

    // Strip "commentary..." prefix (gpt-oss commentary tokens)
    if s.starts_with("commentary") {
        // Find the start of actual content after "commentaryto=functions.xxx json"
        if let Some(json_pos) = s.find("json{") {
            // Keep the opening brace: skip only the "json" tag.
            s = &s[json_pos + "json".len()..];
        } else if let Some(json_pos) = s.find("json ") {
            s = &s[json_pos + "json ".len()..];
        }
    }
    s.trim()
}
/// Agent that bundles an LLM client, a prompt set and the tools the agent
/// may use.
#[derive(Debug, Clone)]
pub struct ProposerEvaluatorAgent {
/// Agent configuration (name, provider settings, `read_file_roots`, …).
pub config: AgentConfig,
/// LLM client used by this agent.
llm: Box<dyn AiModel>,
/// Prompt set used by this agent.
prompt_set: Box<dyn PromptSet>,
/// Caller-supplied context tools; cloned into every assembled tool set.
pub extra_context_tools: Vec<Box<dyn Tool>>,
/// Caller-supplied sandbox tools; cloned into every assembled tool set
/// after `extra_context_tools`.
pub sandbox_tools: Vec<Box<dyn Tool>>,
/// Optional output-leak detector used by `prompt_exposure_guard`.
/// `None` disables guarding; attach any `OutputLeakDetector` impl
/// via [`ProposerEvaluatorAgent::with_output_guard`].
output_guard: Option<std::sync::Arc<dyn crate::agents::OutputLeakDetector>>,
}
impl ProposerEvaluatorAgent {
    /// Build the complete tool list offered to the model for `context`.
    ///
    /// Tools are appended in a fixed order:
    /// 1. clones of `extra_context_tools`, then clones of `sandbox_tools`;
    /// 2. the NSED protocol tools (`read_proposal`, `read_critiques`,
    ///    `read_own_proposal`, `search_deliberation`) when `context.store`
    ///    is present;
    /// 3. one `UserCallTool` per entry of `context.user_tools` when
    ///    `context.user_tool_handler` is present;
    /// 4. the scoped `read_file` tool, only for openai-family providers with
    ///    a non-empty `read_file_roots`.
    fn aggregate_tools(&self, context: &AgentContext) -> Vec<Box<dyn Tool>> {
        let round = context.round_number;

        // Start from the caller-supplied tools: context tools first, sandbox second.
        let mut tools: Vec<Box<dyn Tool>> = self
            .extra_context_tools
            .iter()
            .chain(self.sandbox_tools.iter())
            .map(|tool| dyn_clone::clone_box(tool.as_ref()))
            .collect();

        // Protocol tools need the persistent store to read from.
        if let Some(store) = &context.store {
            let protocol_tools: [Box<dyn Tool>; 4] = [
                Box::new(ReadProposalTool::new(store.clone(), round)),
                Box::new(ReadCritiquesTool::new(store.clone(), round)),
                Box::new(ReadOwnProposalTool::new(
                    store.clone(),
                    round,
                    self.config.name.clone(),
                )),
                Box::new(SearchDeliberationTool::new(store.clone(), round)),
            ];
            tools.extend(protocol_tools);
            debug!(agent=%self.config.name, "Injected NSED protocol tools (read_proposal, search_deliberation, etc.) via persistent store.");
        }

        // User-defined tools are only callable when a handler can service them.
        if let Some(handler) = &context.user_tool_handler {
            tools.extend(context.user_tools.iter().map(|definition| {
                Box::new(UserCallTool::new(
                    definition.clone(),
                    handler.clone(),
                    round,
                    context.phase,
                )) as Box<dyn Tool>
            }));
            if !context.user_tools.is_empty() {
                debug!(agent=%self.config.name, count=%context.user_tools.len(), "Injected user-defined tools");
            }
        }

        // The sandboxed read_file tool is limited to openai-family
        // providers: Claude/MCP/exec already have their own filesystem
        // affordances, and mounting a second one would broaden capability.
        let wants_scoped_read = crate::agents::config::is_openai_family_provider(&self.config)
            && !self.config.read_file_roots.is_empty();
        if wants_scoped_read {
            let scoped_read = crate::tools::scoped_read::ScopedReadFileTool::new(
                self.config.name.clone(),
                &self.config.read_file_roots,
            );
            tools.push(Box::new(scoped_read));
        }

        // Record the final tool line-up for this run.
        let tool_names: Vec<String> = tools.iter().map(|tool| tool.name()).collect();
        debug!(agent=%self.config.name, available_tools=?tool_names, "Tools configured for this run");
        tools
    }
}
/// Bundle of an LLM client, a prompt set, and a list of tools.
///
/// NOTE(review): nothing in the visible part of this file constructs or reads
/// this type — presumably a role-specific holder kept for API compatibility;
/// confirm against callers before changing or removing it.
#[derive(Debug, Clone)]
// The private fields are not read anywhere in the visible code, hence the
// `dead_code` allowance.
#[allow(dead_code)]
pub struct Proposer {
/// Model client held by this bundle.
llm_client: Box<dyn AiModel>,
/// Prompt templates held by this bundle.
prompt_set: Box<dyn PromptSet>,
/// Tools associated with this bundle (the only public field).
pub tools: Vec<Box<dyn Tool>>,
}
/// Bundle of an LLM client, a prompt set, and a list of tools; structurally
/// identical to [`Proposer`].
///
/// NOTE(review): nothing in the visible part of this file constructs or reads
/// this type — presumably a role-specific holder kept for API compatibility;
/// confirm against callers before changing or removing it.
#[derive(Debug, Clone)]
// The private fields are not read anywhere in the visible code, hence the
// `dead_code` allowance.
#[allow(dead_code)]
pub struct Evaluator {
/// Model client held by this bundle.
llm_client: Box<dyn AiModel>,
/// Prompt templates held by this bundle.
prompt_set: Box<dyn PromptSet>,
/// Tools associated with this bundle (the only public field).
pub tools: Vec<Box<dyn Tool>>,
}
impl ProposerEvaluatorAgent {
pub fn new(
config: AgentConfig,
llm: Box<dyn AiModel>,
prompt_set: Box<dyn PromptSet>,
context_tools: Vec<Box<dyn Tool>>,
sandbox_tools: Vec<Box<dyn Tool>>,
) -> Self {
Self {
config,
llm,
prompt_set,
extra_context_tools: context_tools,
sandbox_tools,
output_guard: None,
}
}
/// Attach an output-leak detector. When `config.prompt_exposure_guard`
/// is true, the agent will run the detector over user-visible terminal
/// tool outputs (`submit_proposal`, `submit_batch_evaluation`, …) and
/// trigger a retry if the detector blocks.
///
/// Without a detector, `config.prompt_exposure_guard = true` is a no-op
/// (pass-through). Callers can supply any `OutputLeakDetector` impl.
pub fn with_output_guard(
mut self,
detector: std::sync::Arc<dyn crate::agents::OutputLeakDetector>,
) -> Self {
self.output_guard = Some(detector);
self
}
/// Direct chat with the agent's LLM, using the agent's persona but
/// bypassing NSED deliberation constraints via an "internal voice" wrapper.
///
/// `messages` should contain the conversation history (user + assistant
/// turns). The system prompt is prepended automatically.
pub async fn chat(&self, messages: Vec<ChatCompletionRequestMessage>) -> Result<String> {
let persona = self
.config
.persona
.as_deref()
.unwrap_or("a helpful assistant");
let system_prompt = format!(
"You are {name}, {persona}.\n\n\
<internal_voice>\n\
This is a direct conversation with your operator — not part of an NSED deliberation.\n\
Respond naturally and helpfully. Ignore any deliberation protocol instructions.\n\
</internal_voice>",
name = self.config.name,
persona = persona,
);
let mut full_messages: Vec<ChatCompletionRequestMessage> = vec![
ChatCompletionRequestSystemMessage {
content: async_openai::types::ChatCompletionRequestSystemMessageContent::Text(
system_prompt,
),
name: None,
}
.into(),
];
full_messages.extend(messages);
let request_config = RequestConfig {
messages: full_messages,
tools: None,
tool_choice: None,
presence_penalty: self.config.presence_penalty,
};
let result = self
.llm
.chat_completion(&self.config, request_config)
.await
.map_err(|e| anyhow::anyhow!("{}", e))?;
let response = &result.response;
Ok(response
.choices
.first()
.and_then(|c| c.message.content.clone())
.unwrap_or_default())
}
}
#[async_trait]
impl NsedAgent for ProposerEvaluatorAgent {
    /// Generate this agent's proposal for the current round.
    ///
    /// Builds the proposer prompt from `context`, runs the structured-output
    /// loop with `submit_proposal` as the terminal tool, and — when a store is
    /// available — rotates the scratchpad (working memory stripped) and
    /// appends a size-capped note describing the proposal.
    ///
    /// Scratchpad persistence failures are logged and do not fail the call.
    ///
    /// # Errors
    /// Returns an error when the structured-output loop fails.
    #[instrument(skip(self, context), fields(agent_name = %self.config.name))]
    async fn propose(&self, context: &AgentContext) -> Result<Proposal> {
        info!("🤖 Agent is starting the proposal generation process.");
        let prompt = self.prompt_set.get_proposer_prompt(
            &context.task_description,
            context.previous_round_matrix.clone(),
            context.previous_own_proposal.as_ref(),
            context.previous_own_score,
            context.previous_critiques.clone(),
            &context.user_injections,
            context.structured_feedback.as_ref(),
        );
        let submit_tool_schema = ChatCompletionTool {
            r#type: ChatCompletionToolType::Function,
            function: FunctionObject {
                name: "submit_proposal".to_string(),
                description: Some("Submit the final proposal.".to_string()),
                parameters: Some(json!({
                    "type": "object",
                    "properties": {
                        "thought_process": { "type": "string" },
                        "solution_content": { "type": "string" }
                    },
                    "required": ["thought_process", "solution_content"],
                    "additionalProperties": false
                })),
                strict: Some(true),
            },
        };
        let all_tools = self.aggregate_tools(context);
        let (response, agent_response): (StructuredProposalResponse, AgentResponse) =
            generate_structured_output(
                &*self.llm,
                &self.config,
                &*self.prompt_set,
                context,
                prompt,
                &all_tools,
                submit_tool_schema,
                "submit_proposal",
                self.output_guard.as_deref(),
            )
            .await?;
        info!("🧠 Agent Reasoning: {}", response.thought_process);
        if let Some(store) = &context.store {
            // Phase 6B: Strip ephemeral <working_memory> before persisting for next round.
            // Retains <key_findings> and <strategy> sections.
            if let Ok(Some(current)) = store.get(&self.config.name).await {
                let cleaned = strip_working_memory(&current);
                // Only rewrite the scratchpad when stripping actually removed something.
                if cleaned.len() != current.len() {
                    if let Err(e) = store.set(&self.config.name, &cleaned).await {
                        warn!(error=%e, "Failed to persist scratchpad after working memory rotation");
                    }
                }
            }
            let max_len = self.config.scratchpad_limit as usize;
            let mut note = format!(
                "Round {} Proposal:\nThought Process: {}\nContent: {}\n",
                context.round_number, response.thought_process, response.solution_content
            );
            if note.len() > max_len {
                const TRUNCATION_SUFFIX: &str = "...(truncated)";
                // Leave room for the suffix, then back up to the nearest UTF-8
                // character boundary so `truncate` cannot panic mid-codepoint.
                let truncate_at = max_len.saturating_sub(TRUNCATION_SUFFIX.len());
                // `truncate_at < note.len()` here and index 0 is always a
                // boundary, so the search always succeeds within a few steps.
                let safe_at = (0..=truncate_at)
                    .rev()
                    .find(|&i| note.is_char_boundary(i))
                    .unwrap_or(0);
                note.truncate(safe_at);
                note.push_str(TRUNCATION_SUFFIX);
            }
            if let Err(e) = store.append(&self.config.name, &note).await {
                warn!(error=%e, "Failed to persist proposal to scratchpad");
            }
        }
        Ok(Proposal {
            thought_process: response.thought_process,
            content: response.solution_content,
            final_scratchpad: agent_response.final_scratchpad.clone(), // Capture the scratchpad
            token_usage_stats: Some(TokenUsage {
                input_tokens: agent_response.input_tokens.unwrap_or(0),
                output_tokens: agent_response.output_tokens.unwrap_or(0),
            }),
            // Propagate terminal signal ("max_iterations" partial
            // fallback, otherwise LLM-driven stop/tool_calls). Lets
            // the orchestrator + dashboard distinguish partial output
            // from full completions without re-reading LLM internals.
            finish_reason: agent_response
                .finish_reason
                .as_deref()
                .map(normalize_finish_reason),
            ..Default::default()
        })
    }

    /// Evaluate every candidate in `context.candidates` with a single
    /// batched LLM interaction.
    ///
    /// Runs the structured-output loop with `submit_batch_evaluation` as the
    /// terminal tool, drops evaluations whose `agent_id` is not a known
    /// candidate, and normalizes each remaining `endorsement_weight` against
    /// the sum of absolute weights of the valid evaluations.
    ///
    /// Every returned [`Evaluation`] carries the same batch-level token usage
    /// and finish reason, because one interaction produced all of them.
    ///
    /// # Errors
    /// Returns an error when the structured-output loop fails.
    #[instrument(skip(self, context), fields(agent_name = %self.config.name))]
    async fn evaluate(&self, context: &AgentContext) -> Result<Vec<(String, Evaluation)>> {
        info!("🕵️ Agent is starting the batch proposal evaluation process.");
        // Per-evaluation schema. Strict mode requires every property to be
        // listed in `required`, so optional values are modelled as nullable.
        let eval_item_properties = json!({
            "agent_id": { "type": "string" },
            "endorsement_weight": { "type": "number" },
            "justification": { "type": "string" },
            "is_final_solution": { "type": "boolean" },
            "stance": {
                "type": ["string", "null"],
                "enum": ["strong_agree", "agree", "neutral", "disagree", "strong_disagree", null]
            },
            "claim_assessments": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "claim_id": { "type": ["string", "null"] },
                        "claim": { "type": "string" },
                        "verdict": { "type": "string", "enum": ["verified", "contested", "unverified", "wrong"] },
                        "reason": { "type": ["string", "null"] }
                    },
                    "required": ["claim_id", "claim", "verdict", "reason"],
                    "additionalProperties": false
                }
            },
            "disagreements": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "claim_id": { "type": ["string", "null"] },
                        "proposal_claims": { "type": "string" },
                        "evaluator_position": { "type": "string" },
                        "confidence": { "type": "string", "enum": ["high", "medium", "low"] }
                    },
                    "required": ["claim_id", "proposal_claims", "evaluator_position", "confidence"],
                    "additionalProperties": false
                }
            },
            "category_scores": {
                "type": ["object", "null"],
                "properties": {
                    "correctness": { "type": "number" },
                    "completeness": { "type": "number" },
                    "novelty": { "type": "number" },
                    "feasibility": { "type": "number" },
                    "evidence_quality": { "type": "number" }
                },
                "required": ["correctness", "completeness", "novelty", "feasibility", "evidence_quality"],
                "additionalProperties": false
            }
        });
        let batch_evaluation_tool = ChatCompletionTool {
            r#type: ChatCompletionToolType::Function,
            function: FunctionObject {
                name: "submit_batch_evaluation".to_string(),
                description: Some(
                    "Submit batch evaluations with structured claim-level analysis.".to_string(),
                ),
                parameters: Some(json!({
                    "type": "object",
                    "properties": {
                        "evaluations": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": eval_item_properties,
                                "required": ["agent_id", "endorsement_weight", "justification", "is_final_solution",
                                             "stance", "claim_assessments", "disagreements", "category_scores"],
                                "additionalProperties": false
                            }
                        }
                    },
                    "required": ["evaluations"],
                    "additionalProperties": false
                })),
                strict: Some(true),
            },
        };
        let prompt = self.prompt_set.get_batch_evaluator_prompt(
            &context.task_description,
            &context.candidates,
            context.previous_own_proposal.as_ref(),
            context.round_number as usize,
            &context.user_injections,
        );
        let all_tools = self.aggregate_tools(context);
        let (structured_response, agent_response): (
            StructuredBatchEvaluationResponse,
            AgentResponse,
        ) = generate_structured_output(
            &*self.llm,
            &self.config,
            &*self.prompt_set,
            context,
            prompt,
            &all_tools,
            batch_evaluation_tool,
            "submit_batch_evaluation",
            self.output_guard.as_deref(),
        )
        .await?;
        // Token usage for this entire evaluation batch (one LLM call produces all evals)
        let batch_token_usage = Some(TokenUsage {
            input_tokens: agent_response.input_tokens.unwrap_or(0),
            output_tokens: agent_response.output_tokens.unwrap_or(0),
        });
        // Build the set of valid candidate IDs so we can filter out
        // hallucinated self-evaluations before normalization.
        let valid_ids: std::collections::HashSet<&str> =
            context.candidates.iter().map(|c| c.id.as_str()).collect();
        // Filter to valid candidates and warn about dropped evaluations.
        let valid_evals: Vec<_> = structured_response
            .evaluations
            .into_iter()
            .filter(|e| {
                if valid_ids.contains(e.agent_id.as_str()) {
                    true
                } else {
                    warn!(
                        agent_name = %self.config.name,
                        target = %e.agent_id,
                        "Dropping evaluation for unknown candidate (hallucinated self-eval?)"
                    );
                    false
                }
            })
            .collect();
        // Normalize only over valid evaluations.
        let total_weight: f32 = valid_evals.iter().map(|e| e.endorsement_weight.abs()).sum();
        debug!(
            agent_name = %self.config.name,
            valid_count = valid_evals.len(),
            total_abs_weight = total_weight,
            weights = ?valid_evals.iter().map(|e| (&e.agent_id, e.endorsement_weight)).collect::<Vec<_>>(),
            "Normalization input"
        );
        // Diagnostic: warn when all weights share the same sign (no mixed endorsement/opposition)
        let all_negative = valid_evals.iter().all(|e| e.endorsement_weight <= 0.0);
        let all_positive = valid_evals.iter().all(|e| e.endorsement_weight >= 0.0);
        if valid_evals.len() > 1 && (all_negative || all_positive) {
            debug!(
                agent_name = %self.config.name,
                sign = if all_negative { "all_negative" } else { "all_positive" },
                weights = ?valid_evals.iter().map(|e| (&e.agent_id, e.endorsement_weight)).collect::<Vec<_>>(),
                "All endorsement weights have the same sign — legitimate in signed pipeline"
            );
        }
        let mut results = Vec::with_capacity(valid_evals.len());
        for item in valid_evals {
            let justification = item.justification.unwrap_or_default();
            if justification.is_empty() {
                warn!(
                    agent_name = %self.config.name,
                    target = %item.agent_id,
                    "Evaluation missing justification — LLM omitted required field."
                );
            }
            let raw_score = normalize_score(item.endorsement_weight, total_weight);
            results.push((
                item.agent_id,
                Evaluation {
                    score: raw_score,
                    justification,
                    token_usage: batch_token_usage.clone(),
                    stance: item.stance,
                    claim_assessments: item.claim_assessments,
                    disagreements: item.disagreements,
                    category_scores: item.category_scores,
                    is_final_solution: item.is_final_solution,
                    // Batch-level: one LLM call produced all items,
                    // so every Evaluation in this batch carries the
                    // same finish_reason (e.g. "max_iterations" when
                    // the react ceiling was hit before a terminal
                    // submit_batch_evaluation call).
                    finish_reason: agent_response
                        .finish_reason
                        .as_deref()
                        .map(normalize_finish_reason),
                    ..Default::default()
                },
            ));
        }
        Ok(results)
    }

    /// The agent's configured name.
    fn name(&self) -> String {
        self.config.name.clone()
    }
}
/// Implement [`ChatCapable`] so `ProposerEvaluatorAgent` can be used with the
/// SDK worker's status server chat endpoint.
#[async_trait]
impl ChatCapable for ProposerEvaluatorAgent {
async fn chat(
&self,
messages: Vec<async_openai::types::ChatCompletionRequestMessage>,
) -> Result<String> {
// Delegates to the inherent method
ProposerEvaluatorAgent::chat(self, messages).await
}
}
// ... helpers (react_loop, generate_structured_output) ...
#[allow(clippy::too_many_arguments)]
async fn generate_structured_output<T>(
llm_client: &dyn AiModel,
agent_config: &AgentConfig,
prompt_set: &dyn PromptSet,
context: &AgentContext,
initial_prompt: String,
tools: &[Box<dyn Tool>],
terminal_tool_schema: ChatCompletionTool,
terminal_tool_name: &str,
output_guard: Option<&dyn crate::agents::OutputLeakDetector>,
) -> Result<(T, AgentResponse)>
where
// `Serialize` is required so the prompt-exposure guard (when enabled
// via `AgentConfig.prompt_exposure_guard`) can serialize the parsed
// terminal content back to JSON and extract user-visible fields for
// scanning. Concrete types passed through here (`Proposal`,
// `BatchEvaluation`) already implement `Serialize`.
T: DeserializeOwned + serde::Serialize,
{
let max_retries = agent_config.max_retries.filter(|&v| v > 0).unwrap_or(3) as usize;
let mut attempts = 0;
let mut current_prompt = Some(initial_prompt);
let mut current_history: Option<Vec<ChatCompletionRequestMessage>> = None;
// Accumulate tokens across retry attempts so failed-attempt tokens aren't lost
let mut cumulative_input_tokens: u32 = 0;
let mut cumulative_output_tokens: u32 = 0;
// Owned here so schema retries inherit prior tool-output bloat.
let mut running_tool_output_bytes: u64 = 0;
// Mutable clone so we can escalate max_tokens on Length truncation retries
// or downgrade reasoning_effort on consecutive Stop+empty failures
let mut retry_config = agent_config.clone();
let mut consecutive_empty_stops: u32 = 0;
let loop_start = std::time::Instant::now();
loop {
attempts += 1;
let agent_response = match react_loop(
llm_client,
&mut retry_config,
prompt_set,
context,
current_prompt.clone(),
current_history.clone(),
tools,
vec![terminal_tool_schema.clone()],
Some(terminal_tool_name),
attempts as u32,
&mut running_tool_output_bytes,
)
.await
{
Ok(resp) => resp,
Err(e) => {
// Transport-class errors (connection reset, EOF,
// timeout, mid-body decode) are retryable — the model
// never saw our request or the response was lost in
// transit. Classify via `LlmError` downcast so the
// taxonomy is compile-time exhaustive instead of
// string-scraped from the formatted error.
use crate::telemetry::LlmError;
let is_transport =
matches!(e.downcast_ref::<LlmError>(), Some(LlmError::Transport(_)));
if is_transport && attempts <= max_retries {
// Transport errors interrupt the "consecutive empty-Stop"
// streak — don't let mixed network+empty failures trip
// the reasoning_effort downgrade prematurely.
consecutive_empty_stops = 0;
let exp = (attempts as u32).min(5); // cap exponent to avoid overflow (max 2^5 = 32s)
let backoff = std::time::Duration::from_secs(2u64.pow(exp));
warn!(
agent_name = %agent_config.name,
attempt = attempts,
max_retries = max_retries,
error = %format!("{e:#}"),
retry_after = ?backoff,
"Transport error — retrying with backoff."
);
tokio::time::sleep(backoff).await;
continue;
}
return Err(e);
}
};
// Accumulate tokens from this attempt (including any failed attempts)
cumulative_input_tokens += agent_response.input_tokens.unwrap_or(0);
cumulative_output_tokens += agent_response.output_tokens.unwrap_or(0);
let mut cleaned_json = clean_json_string(
&agent_response.content,
agent_config.unwrap_hallucinated_tool_calls,
Some(terminal_tool_name),
);
let mut parse_result = serde_json::from_str::<T>(&cleaned_json);
// Attempt repair if parsing failed
if parse_result.is_err() {
// 1. Try repairing truncation
let repaired = repair_truncated_json(&cleaned_json);
if let Ok(repaired_obj) = serde_json::from_str::<T>(&repaired) {
warn!(
agent_name = %agent_config.name,
"Successfully repaired truncated JSON output."
);
parse_result = Ok(repaired_obj);
} else {
// 2. Try repairing invalid escapes (common with LaTeX in JSON)
let repaired_escapes = repair_invalid_escapes(&cleaned_json);
// Also repair truncation on the escaped version just in case
let repaired_escapes_truncated = repair_truncated_json(&repaired_escapes);
if let Ok(repaired_obj) = serde_json::from_str::<T>(&repaired_escapes_truncated) {
warn!(
agent_name = %agent_config.name,
"Successfully repaired invalid JSON escapes (likely LaTeX)."
);
parse_result = Ok(repaired_obj);
} else {
// 3. Try aggressive escape repair (blindly escape backslashes except quotes/backslashes)
let aggressive = repair_aggressive_escapes(&cleaned_json);
let aggressive_truncated = repair_truncated_json(&aggressive);
if let Ok(repaired_obj) = serde_json::from_str::<T>(&aggressive_truncated) {
warn!(
agent_name = %agent_config.name,
"Successfully repaired JSON using aggressive escaping."
);
parse_result = Ok(repaired_obj);
} else {
// 4. Nuclear Option: Strip invalid escapes entirely (lossy)
let sanitized = sanitize_json_string_lossy(&cleaned_json);
let sanitized_truncated = repair_truncated_json(&sanitized);
if let Ok(repaired_obj) = serde_json::from_str::<T>(&sanitized_truncated) {
warn!(
agent_name = %agent_config.name,
"Successfully repaired JSON by stripping invalid escapes (lossy)."
);
parse_result = Ok(repaired_obj);
}
}
}
}
}
// 5. Try conversational repair (Rnj-1 style: "THOUGHT: ... RESPONSE: ...")
if parse_result.is_err()
&& let Some(repaired_conv) = repair_conversational_response(&agent_response.content)
&& let Ok(repaired_obj) = serde_json::from_str::<T>(&repaired_conv)
{
warn!(
"Successfully repaired conversational JSON for agent {}: {}",
agent_config.name, repaired_conv
);
parse_result = Ok(repaired_obj);
}
// 5b. Try extracting proposal from Markdown report (e.g. gpt-oss style)
if parse_result.is_err()
&& terminal_tool_name == "submit_proposal"
&& let Some(repaired_markdown) = extract_proposal_from_markdown(&agent_response.content)
&& let Ok(repaired_obj) = serde_json::from_str::<T>(&repaired_markdown)
{
warn!(
agent_name = %agent_config.name,
"Successfully extracted proposal from Markdown report."
);
parse_result = Ok(repaired_obj);
}
// 5c. Try extracting evaluations from Markdown table
if parse_result.is_err()
&& terminal_tool_name == "submit_batch_evaluation"
&& let Some(repaired_markdown) =
extract_evaluations_from_markdown(&agent_response.content)
&& let Ok(repaired_obj) = serde_json::from_str::<T>(&repaired_markdown)
{
warn!(
agent_name = %agent_config.name,
"Successfully extracted evaluations from Markdown table."
);
parse_result = Ok(repaired_obj);
}
// 5d. For proposals: split merged thought_process/solution_content.
// gpt-oss and MiniMax frequently put everything into `thought_process`
// with a `**Solution Content**` marker, leaving `solution_content` absent.
if parse_result.is_err() && terminal_tool_name == "submit_proposal" {
if let Ok(mut obj) = serde_json::from_str::<serde_json::Value>(&cleaned_json) {
let needs_split = obj.get("solution_content").is_none()
&& obj
.get("thought_process")
.and_then(|v| v.as_str())
.is_some();
if needs_split {
let tp = obj["thought_process"].as_str().unwrap_or_default();
// Look for well-known section markers
let markers = [
"\n**Solution Content**\n",
"\n**Solution Content:**\n",
"\n**Solution Content**:",
"\n## Solution Content\n",
"\n### 1. ",
"\n## 1. ",
];
let mut split_pos = None;
for marker in &markers {
if let Some(pos) = tp.find(marker) {
split_pos = Some((pos, marker.len()));
break;
}
}
if let Some((pos, _marker_len)) = split_pos {
let thought = tp[..pos].trim().to_string();
// For "### 1." style markers, include the marker in solution_content
let solution = if markers.iter().take(4).any(|m| tp[pos..].starts_with(m)) {
// Strip the "**Solution Content**" header itself
tp[pos..]
.trim_start_matches(|c: char| {
c.is_whitespace() || c == '*' || c == '#'
})
.trim_start_matches("Solution Content")
.trim_start_matches(|c: char| {
c == ':' || c == '*' || c == '#' || c.is_whitespace()
})
.trim()
.to_string()
} else {
// Keep the numbered section header
tp[pos..].trim().to_string()
};
obj["thought_process"] = serde_json::Value::String(thought);
obj["solution_content"] = serde_json::Value::String(solution);
if let Ok(repaired_obj) = serde_json::from_value::<T>(obj) {
warn!(
agent_name = %agent_config.name,
"Successfully split merged thought_process/solution_content."
);
parse_result = Ok(repaired_obj);
}
}
}
}
}
// 5e. Strip thinking-token prefixes from gpt-oss reasoning models.
// Models like gpt-oss-120b leak internal tokens: "analysisWe need...assistantfinal**Critique..."
// or just "final**Asset-class allocation...". Strip these and retry as markdown proposal.
if parse_result.is_err() && terminal_tool_name == "submit_proposal" {
let content = agent_response.content.trim();
let stripped = strip_thinking_prefix(content);
if stripped.len() < content.len() && !stripped.is_empty() {
// The stripped content is markdown without code blocks — wrap it as a proposal
let obj = serde_json::json!({
"thought_process": "(extracted from thinking-prefixed markdown)",
"solution_content": stripped,
});
if let Ok(repaired_obj) = serde_json::from_value::<T>(obj) {
warn!(
agent_name = %agent_config.name,
"Successfully stripped thinking-token prefix and extracted proposal."
);
parse_result = Ok(repaired_obj);
}
}
}
// 6. Handle explicit refusal (Safety/Policy) as a valid proposal
if parse_result.is_err() && terminal_tool_name == "submit_proposal" {
let lower_content = agent_response
.content
.trim()
.to_lowercase()
.replace(['\u{2018}', '\u{2019}'], "'")
.replace(['\u{201c}', '\u{201d}'], "\"");
if lower_content.starts_with("i cannot")
|| lower_content.starts_with("i can't")
|| lower_content.starts_with("i apologize")
|| lower_content.starts_with("i'm sorry")
|| lower_content.starts_with("sorry")
|| lower_content.contains("as an ai")
|| lower_content.contains("cannot fulfill")
{
let refusal_json = serde_json::json!({
"thought_process": "The model refused to answer the prompt, likely due to safety guidelines.",
"solution_content": agent_response.content
})
.to_string();
if let Ok(repaired_obj) = serde_json::from_str::<T>(&refusal_json) {
warn!(
agent_name = %agent_config.name,
"Detected refusal/safety response. Treating as valid proposal."
);
parse_result = Ok(repaired_obj);
}
}
}
// Prompt-exposure guardrail — runs on successfully parsed terminal
// content BEFORE we return it to the deliberation store. A block
// converts the Ok into an Err carrying the block reason so the
// existing retry path feeds it back to the LLM as a `SYSTEM
// ERROR` user message.
//
// Gated by `agent_config.prompt_exposure_guard` so existing
// deployments are unaffected until they opt in.
if let Ok(ref parsed) = parse_result
&& agent_config.prompt_exposure_guard
&& let Some(detector) = output_guard
&& let Some(block_result) =
run_prompt_exposure_guard(parsed, terminal_tool_name, &agent_config.name, detector)
.await
{
warn!(
agent_name = %agent_config.name,
attempt = attempts,
terminal_tool = terminal_tool_name,
"prompt_exposure guard blocked terminal content; triggering retry."
);
emit_for!(
context,
PromptExposureDetected {
terminal_tool: terminal_tool_name.to_string(),
blocked: true,
hit_count: block_result.hit_count,
response_length_chars: block_result.response_length_chars,
suspicion_score: block_result.suspicion_score,
xml_tag_hits: block_result.xml_tag_hits,
tool_name_hits: block_result.tool_name_hits,
instruction_hits: block_result.instruction_hits,
wrong_acronym_hits: block_result.wrong_acronym_hits,
sample_hits: block_result.sample_hits,
}
);
// Override cleaned_json so the dump/retry rendering names the
// actual block reason rather than the (valid) terminal JSON
// that parsed fine.
cleaned_json = format!(
"prompt_exposure guard blocked output: {}",
block_result.reason
);
// Synthesize a serde_json::Error so we can slot back into the
// existing `Err` arm. Deserializing invalid JSON returns the
// error type we need without paying the parse.
let synth_err: serde_json::Error =
serde_json::from_str::<serde_json::Value>("not-json-prompt-exposure-block")
.expect_err("invalid JSON always errors");
parse_result = Err(synth_err);
}
match parse_result {
Ok(response) => {
// Return cumulative tokens across all attempts (including retries)
let mut final_response = agent_response;
final_response.input_tokens = Some(cumulative_input_tokens);
final_response.output_tokens = Some(cumulative_output_tokens);
return Ok((response, final_response));
}
Err(e) => {
let finish_reason = agent_response.finish_reason.as_deref().unwrap_or("unknown");
// Log the FULL raw output to debug (logfile only)
debug!(
agent_name = %agent_config.name,
attempt = attempts,
error = %e,
finish_reason = %finish_reason,
raw_content = %cleaned_json,
"Failed to parse structured output. Full raw content logged."
);
let truncated_raw = if cleaned_json.chars().count() > 1000 {
format!(
"{}... (truncated)",
cleaned_json.chars().take(1000).collect::<String>()
)
} else {
cleaned_json.clone()
};
if attempts > max_retries {
anyhow::bail!(
"Failed to parse structured output after {attempts} attempts. Last error: {e}. FinishReason: {finish_reason}. Raw (truncated): '{truncated_raw}'"
);
}
// Dump failure details to file for debugging (opt-in via NSED_FAILURE_DUMPS=1|full)
let phase_str = format!("{:?}", context.phase);
if let Some(dump_file) = write_failure_dump(FailureDumpParams {
kind: "parse_error",
agent_name: &agent_config.name,
model_name: &agent_config.model_name,
provider_id: &agent_config.provider_id,
error: &e.to_string(),
session_id: context.session_id.as_deref(),
phase: Some(&phase_str),
round: Some(context.round_number),
attempt: Some(attempts),
finish_reason: Some(finish_reason),
input_tokens: agent_response.input_tokens,
output_tokens: agent_response.output_tokens,
response_content: Some(&cleaned_json),
system_prompt: agent_response.system_prompt.as_deref(),
request_body: agent_response.request_body.as_deref(),
messages: None,
failure_dumps_config: agent_config.failure_dumps.as_deref(),
}) {
debug!(dump_file = %dump_file, "Parse failure dump saved");
}
warn!(
agent_name = %agent_config.name,
attempt = attempts,
error = %e,
finish_reason = %finish_reason,
"Agent failed to produce valid structured output. Retrying with feedback."
);
// Escalate max_tokens on Length truncation to give the model more
// room on the next attempt. Cap at 50% of context_window to leave
// space for input tokens and safety buffer.
if finish_reason == "Length" {
let old = retry_config.max_tokens;
let ceiling = if retry_config.context_window > 0 {
(retry_config.context_window as f64 * 0.5) as i32
} else {
32_768
};
let new_tokens = ((old as f64 * 1.5).ceil() as i32).min(ceiling);
if new_tokens > old {
retry_config.max_tokens = new_tokens;
warn!(
agent_name = %retry_config.name,
old_max_tokens = old,
new_max_tokens = new_tokens,
"Escalating max_tokens after Length truncation."
);
}
}
// Downgrade reasoning_effort after consecutive Stop+empty
// failures. The model is likely spending all output tokens
// on internal chain-of-thought (reasoning_effort: medium/high)
// and producing zero visible content. Removing reasoning
// effort forces the model to output content directly.
if cleaned_json.trim().is_empty() && finish_reason == "Stop" {
consecutive_empty_stops += 1;
if consecutive_empty_stops >= 2 && retry_config.reasoning_effort.is_some() {
warn!(
agent_name = %retry_config.name,
consecutive_empty_stops,
old_effort = ?retry_config.reasoning_effort,
"Downgrading reasoning_effort after consecutive Stop+empty failures"
);
retry_config.reasoning_effort = None;
}
} else {
consecutive_empty_stops = 0;
}
let error_msg = if cleaned_json.trim().is_empty() {
// Model consumed output tokens but produced no visible content.
// Common with reasoning/thinking models where tokens go to
// internal chain-of-thought that doesn't appear in the response.
"Your response was EMPTY — no content was returned. You MUST call the tool with a JSON argument. Do NOT just think about the answer — you must actually output the tool call with the required fields.".to_string()
} else if !cleaned_json.trim().starts_with('{') {
format!(
"Agent failed to use structured output tool. Returned conversational text instead. Raw: '{truncated_raw}'."
)
} else {
let hint = if e.is_eof() && finish_reason == "Length" {
"Likely truncated due to max_tokens limit. Check input length."
} else if e.is_eof() {
"Response appears incomplete despite finish_reason=Stop."
} else {
""
};
format!(
"Failed to parse structured output JSON. Error: {e} {hint}. Raw: '{truncated_raw}'."
)
};
let example_json = match terminal_tool_name {
"submit_proposal" => {
r#"{ "thought_process": "...", "solution_content": "..." }"#
}
"submit_batch_evaluation" => {
r#"{ "evaluations": [ { "agent_id": "Candidate_A", "endorsement_weight": 80.0, "justification": "...", "is_final_solution": false, "stance": null, "claim_assessments": [], "disagreements": [], "category_scores": null } ] }"#
}
other => {
warn!(tool_name = %other, "No example JSON template for terminal tool");
"{}"
}
};
// Update history with the failed attempt and error message.
// When the previous attempt failed mid-tool-call (terminal-tool
// parse failure, streaming truncation, etc.), the assistant
// turn in `agent_response.history` can contain orphan
// `tool_calls` with no matching `role: "tool"` follow-up.
// Strict providers (Cerebras) reject with HTTP 422 — sanitize
// here so feedback-mode retries always send a spec-compliant
// history. Redundant with the send-path sanitizer in
// native.rs::prepare_request, but cheap and keeps the retry
// path's intermediate state correct for logging/inspection.
let mut repaired_history = agent_response.history.clone();
llm_repair::pair_orphan_tool_calls(&mut repaired_history);
current_history = Some(repaired_history);
current_prompt = None; // Switch to history mode
// Fix empty Assistant content to prevent 400 Bad Request on retry
if let Some(ChatCompletionRequestMessage::Assistant(assistant_msg)) =
current_history.as_mut().unwrap().last_mut()
{
if let Some(
async_openai::types::ChatCompletionRequestAssistantMessageContent::Text(
text,
),
) = &assistant_msg.content
{
if text.trim().is_empty() && assistant_msg.tool_calls.is_none() {
assistant_msg.content = Some(async_openai::types::ChatCompletionRequestAssistantMessageContent::Text("(Empty response)".to_string()));
}
} else if assistant_msg.content.is_none() && assistant_msg.tool_calls.is_none()
{
assistant_msg.content = Some(
async_openai::types::ChatCompletionRequestAssistantMessageContent::Text(
"(Empty response)".to_string(),
),
);
}
}
// Append the error message as a User message to the conversation history
current_history.as_mut().unwrap().push(
ChatCompletionRequestUserMessage {
content: format!(
"SYSTEM ERROR (Attempt {attempts}/{max_retries}): Your last response failed validation. \
Issue: {error_msg}. \
You MUST use the `{terminal_tool_name}` tool with valid JSON arguments. Example format: \n{example_json}\n\
Do not return raw text. Please try again."
)
.into(),
..Default::default()
}
.into(),
);
if context.telemetry.is_some() {
let cumulative_latency_ms = loop_start.elapsed().as_millis() as u64;
let reason = if cleaned_json.trim().is_empty() {
RetryReason::EmptyContent
} else if finish_reason == "Length" {
RetryReason::Truncated
} else {
RetryReason::SchemaError
};
let cumulative_cost_usd = estimate_llm_cost_usd(
&agent_config.model_name,
cumulative_input_tokens,
cumulative_output_tokens,
0,
0,
);
emit_for!(
context,
RetryLoopAttempt {
attempt: attempts as u32,
reason,
cumulative_latency_ms,
cumulative_cost_usd,
cumulative_input_tokens,
cumulative_output_tokens,
}
);
}
}
}
}
}
/// Result from the prompt-exposure guard, returned when a block occurs.
///
/// Contains enough data for the telemetry emission to populate all fields.
/// Every field except `reason` is copied from the detector's scan of the
/// snippet that triggered the block.
#[derive(Debug)]
struct PromptExposureBlockResult {
    /// Reason supplied by the detector's verdict, or a generic fallback
    /// message when the verdict carried none.
    reason: String,
    /// Total hit count reported by the scan of the offending snippet.
    hit_count: u32,
    /// `response_length_chars` as reported by the scan result.
    response_length_chars: u32,
    /// `suspicion_score` as reported by the scan result.
    suspicion_score: f64,
    /// Per-category hit count: XML tags.
    xml_tag_hits: u32,
    /// Per-category hit count: tool names.
    tool_name_hits: u32,
    /// Per-category hit count: instructions.
    instruction_hits: u32,
    /// Per-category hit count: wrong acronyms.
    wrong_acronym_hits: u32,
    /// At most the first five hits from the scan result.
    sample_hits: Vec<String>,
}

/// Scan the parsed terminal-tool response for prompt-exposure leakage.
///
/// Returns `Some(PromptExposureBlockResult)` when the guardrail wants the
/// agent loop to retry, `None` otherwise. The caller converts the `Some`
/// into an `Err` so the existing parse-error retry path takes over and feeds
/// the reason back to the LLM as a `SYSTEM ERROR` user message.
///
/// Only scans the *user-visible* fields of known terminal tools:
///
/// - `submit_proposal` → the `solution_content` string (the final answer
///   shown to the end user). We deliberately skip `thought_process` —
///   that's internal reasoning; mentions of tool names there are
///   intentional and don't reach the user.
/// - `submit_batch_evaluation` → each evaluation's `justification` plus
///   every `claim_assessments[].reason`. The `ClaimAssessment` struct
///   already declares serde aliases for the off-schema keys models
///   drift to (`disagreement`, `explanation`, `reasoning`), so by the
///   time the parsed struct is round-tripped here those keys are all
///   normalised to `reason`. Those texts feed the peer-critiques block
///   in subsequent rounds and can reach users via the deliberation-brief
///   rendering.
///
/// For any other terminal tool we scan the serialized JSON as a
/// best-effort fallback — better to over-trigger than silently let
/// a leak slip through an untested tool.
///
/// Snippets are checked in collection order and the first one whose verdict
/// is `Verdict::Block` wins. If `parsed` cannot be serialized the guard logs
/// at debug level and passes the response through (`None`).
async fn run_prompt_exposure_guard<T>(
    parsed: &T,
    terminal_tool_name: &str,
    agent_name: &str,
    detector: &dyn crate::agents::OutputLeakDetector,
) -> Option<PromptExposureBlockResult>
where
    T: serde::Serialize,
{
    use crate::middleware::{MiddlewareContext, MiddlewareStage, Verdict};

    // Round-trip through `serde_json::Value` so the field lookups below work
    // for any `T`.
    let parsed_value = match serde_json::to_value(parsed) {
        Ok(value) => value,
        Err(e) => {
            // Can't scan what we can't serialize. `T: Serialize` is enforced
            // at compile time, so this is a runtime serialization failure;
            // log it and pass through rather than failing the agent loop.
            tracing::debug!(
                agent_name = %agent_name,
                error = %e,
                "prompt_exposure guard: could not serialize parsed terminal content; skipping."
            );
            return None;
        }
    };

    // Collect user-visible text snippets for this terminal tool. Anything
    // not enumerated falls back to "scan the whole thing."
    let mut snippets: Vec<String> = Vec::new();
    match terminal_tool_name {
        "submit_proposal" => {
            snippets.extend(
                parsed_value
                    .get("solution_content")
                    .and_then(|v| v.as_str())
                    .map(String::from),
            );
        }
        "submit_batch_evaluation" => {
            let evaluations = parsed_value
                .get("evaluations")
                .and_then(|v| v.as_array())
                .into_iter()
                .flatten();
            for evaluation in evaluations {
                snippets.extend(
                    evaluation
                        .get("justification")
                        .and_then(|v| v.as_str())
                        .map(String::from),
                );
                // The canonical schema (see `ClaimAssessment`) stores the
                // user-visible rationale under `reason`, and its serde
                // aliases normalise the off-schema keys on deserialization,
                // so only `reason` exists by the time we get here. Looking
                // up the raw alias names at this layer would be dead code.
                let assessments = evaluation
                    .get("claim_assessments")
                    .and_then(|v| v.as_array())
                    .into_iter()
                    .flatten();
                snippets.extend(
                    assessments
                        .filter_map(|a| a.get("reason").and_then(|v| v.as_str()))
                        .map(String::from),
                );
            }
        }
        _ => {
            // Unknown terminal tool — scan the serialized JSON so a new
            // tool added without guardrail coverage still blocks leaks
            // by default. The reason string may be noisier but it
            // fails closed rather than open.
            snippets.push(parsed_value.to_string());
        }
    }

    for snippet in snippets {
        // Run the scan directly to get per-category counts; the result also
        // carries `response_length_chars` and `suspicion_score` derived from
        // the same scanned text, so we don't recompute either here.
        let scan_result = detector.scan(&snippet);
        if scan_result.hit_count() == 0 {
            continue;
        }

        // Build a context to run the detector's verdict gate.
        // NOTE(review): `action`, `job_id` and `round` are fixed placeholders
        // here regardless of the terminal tool — confirm the detector does
        // not branch on them.
        let ctx = MiddlewareContext {
            content: serde_json::Value::String(snippet),
            action: "propose".to_string(),
            agent_id: agent_name.to_string(),
            job_id: String::new(),
            round: 0,
            stage: MiddlewareStage::ProviderResponse,
            metadata: serde_json::json!(null),
            hook_state: std::collections::HashMap::new(),
        };
        let verdict = detector.evaluate(&ctx).await;
        if !matches!(verdict.verdict, Verdict::Block) {
            continue;
        }

        return Some(PromptExposureBlockResult {
            reason: verdict
                .reason
                .unwrap_or_else(|| "prompt_exposure detector blocked output".to_string()),
            hit_count: scan_result.hit_count(),
            response_length_chars: scan_result.response_length_chars,
            suspicion_score: scan_result.suspicion_score,
            xml_tag_hits: scan_result.xml_tag_hits,
            tool_name_hits: scan_result.tool_name_hits,
            instruction_hits: scan_result.instruction_hits,
            wrong_acronym_hits: scan_result.wrong_acronym_hits,
            sample_hits: scan_result.hits.into_iter().take(5).collect(),
        });
    }
    None
}
/// Strip the first `<scratchpad>...</scratchpad>` block (and the whitespace
/// around it) from `text`, so that retry paths don't nest scratchpads when the
/// original user content is re-merged with a fresh scratchpad injection.
///
/// The text before the block is right-trimmed and the text after it is
/// left-trimmed. If both sides are non-empty they are rejoined with a blank
/// line; otherwise whichever side is non-empty (possibly neither) is returned.
/// When no opening tag, or no closing tag after it, is found, `text` is
/// returned unchanged.
fn strip_scratchpad(text: &str) -> String {
    // Guard clauses: anything short of a complete block is left untouched.
    let Some((head, rest)) = text.split_once("<scratchpad>") else {
        return text.to_string();
    };
    let Some((_, tail)) = rest.split_once("</scratchpad>") else {
        return text.to_string();
    };

    let head = head.trim_end();
    let tail = tail.trim_start();
    match (head.is_empty(), tail.is_empty()) {
        (true, _) => tail.to_string(),
        (false, true) => head.to_string(),
        (false, false) => format!("{head}\n\n{tail}"),
    }
}
/// Remove every `<working_memory>...</working_memory>` block from scratchpad
/// content.
///
/// Working memory is ephemeral per-round thinking that should not persist
/// across rounds. Blocks are removed one at a time, front to back; after each
/// removal the surrounding text is trimmed at the cut and, when both sides are
/// non-empty, rejoined with a blank line.
///
/// If an opening tag has no matching closing tag, processing stops and the
/// text accumulated so far (including the unmatched tag) is returned.
fn strip_working_memory(text: &str) -> String {
    const OPEN: &str = "<working_memory>";
    const CLOSE: &str = "</working_memory>";

    let mut remaining = text.to_string();
    loop {
        let Some((head, rest)) = remaining.split_once(OPEN) else {
            break;
        };
        let Some((_, tail)) = rest.split_once(CLOSE) else {
            // Malformed — no closing tag. Keep what we have so far.
            break;
        };

        let (head, tail) = (head.trim_end(), tail.trim_start());
        let rebuilt = match (head.is_empty(), tail.is_empty()) {
            (true, _) => tail.to_string(),
            (false, true) => head.to_string(),
            (false, false) => format!("{head}\n\n{tail}"),
        };
        remaining = rebuilt;
    }
    remaining
}
/// Pull just the `<key_findings>` and `<strategy>` sections out of scratchpad
/// content.
///
/// Used during the evaluation phase to provide focused context without
/// ephemeral working memory. For each tag, the first opening tag and the first
/// closing tag after it delimit the section, which is kept with its tags.
/// Sections are emitted in the fixed order key findings, then strategy,
/// separated by a blank line, regardless of their order in `text`.
///
/// Falls back to the full scratchpad if neither section is found complete.
fn extract_evaluation_sections(text: &str) -> String {
    const SECTION_TAGS: [(&str, &str); 2] = [
        ("<key_findings>", "</key_findings>"),
        ("<strategy>", "</strategy>"),
    ];

    let kept: Vec<&str> = SECTION_TAGS
        .iter()
        .filter_map(|&(open, close)| {
            let begin = text.find(open)?;
            let close_at = text[begin..].find(close)?;
            Some(&text[begin..begin + close_at + close.len()])
        })
        .collect();

    if kept.is_empty() {
        // No structured sections found — fall back to full scratchpad.
        text.to_string()
    } else {
        kept.join("\n\n")
    }
}
/// Rough cost estimate for an LLM call in USD.
///
/// Uses approximate per-model pricing selected by substring match on `model`;
/// falls back to generic rates when no family matches.
///
/// # Parameters
/// - `model`: model identifier; may carry a provider prefix or version suffix.
/// - `input_tokens`: total prompt tokens, cached ones included.
/// - `output_tokens`: completion tokens.
/// - `_reasoning_tokens`: accepted for signature compatibility; not priced
///   separately.
/// - `cached_tokens`: the part of `input_tokens` served from the provider's
///   prompt cache. Clamped to `input_tokens`, and billed at the input rate
///   reduced by the family's cache discount.
///
/// # Returns
/// Estimated cost in USD: cached input + non-cached input + output.
fn estimate_llm_cost_usd(
    model: &str,
    input_tokens: u32,
    output_tokens: u32,
    _reasoning_tokens: u32,
    cached_tokens: u32,
) -> f64 {
    const TOKENS_PER_MILLION: f64 = 1_000_000.0;

    // Pricing per 1M tokens (approximate, as of 2024-2025). The third value
    // is the *discount* on cached input: 0.9 means cached tokens cost 10% of
    // the normal input rate.
    let (input_per_m, output_per_m, cached_discount) = if model.contains("gpt-4o") {
        // GPT-4o: $2.50/$10 per 1M. Match the literal "gpt-4o" — testing for
        // a bare "o" also catches "gpt-4-turbo" or "openai/gpt-4".
        (2.50, 10.0, 0.5)
    } else if model.contains("gpt-4") {
        // GPT-4: $10/$30 per 1M
        (10.0, 30.0, 0.5)
    } else if model.contains("gpt-3.5") {
        // GPT-3.5-turbo: $0.50/$1.50 per 1M
        (0.50, 1.50, 0.5)
    } else if model.contains("claude") {
        // Claude Sonnet: $3/$15 per 1M
        (3.0, 15.0, 0.9)
    } else if model.contains("gemini") {
        // Gemini Flash: $0.075/$0.30 per 1M (very cheap)
        (0.075, 0.30, 0.75)
    } else {
        // Generic fallback: $1/$3 per 1M
        (1.0, 3.0, 0.5)
    };

    // Cached tokens are a subset of the prompt; clamp so a bogus usage report
    // can never bill more cached tokens than were sent.
    let cached = cached_tokens.min(input_tokens);
    let non_cached = input_tokens - cached;

    let cached_cost =
        (f64::from(cached) / TOKENS_PER_MILLION) * input_per_m * (1.0 - cached_discount);
    let non_cached_cost = (f64::from(non_cached) / TOKENS_PER_MILLION) * input_per_m;
    let output_cost = (f64::from(output_tokens) / TOKENS_PER_MILLION) * output_per_m;
    cached_cost + non_cached_cost + output_cost
}
/// Build the placeholder JSON payload for a terminal tool that produced no
/// content.
///
/// - `submit_proposal` → an object with a fixed `thought_process` message and
///   an empty `solution_content`.
/// - `submit_batch_evaluation` → an object with an empty `evaluations` array.
/// - any other tool name, or `None` → an empty JSON object.
///
/// Returns the payload serialized as a JSON string.
fn empty_terminal_tool_content(terminal_tool_name: Option<&str>) -> String {
    let placeholder = match terminal_tool_name {
        Some("submit_proposal") => serde_json::json!({
            "thought_process": "Agent reached maximum iterations without producing a final answer.",
            "solution_content": ""
        }),
        Some("submit_batch_evaluation") => serde_json::json!({
            "evaluations": []
        }),
        _ => serde_json::json!({}),
    };
    placeholder.to_string()
}
#[allow(clippy::too_many_arguments)]
async fn react_loop(
llm_client: &dyn AiModel,
agent_config: &mut AgentConfig,
prompt_set: &dyn PromptSet,
context: &AgentContext,
initial_prompt: Option<String>,
input_history: Option<Vec<ChatCompletionRequestMessage>>,
tools: &[Box<dyn Tool>],
extra_tool_schemas: Vec<ChatCompletionTool>,
terminal_tool_name: Option<&str>,
// Outer structured-output retry attempt (1-indexed). Threaded so
// `LlmRequestStart.attempt` carries the real value rather than `1`.
outer_attempt: u32,
running_tool_output_bytes: &mut u64,
) -> Result<AgentResponse> {
// Telemetry emitter comes from the context (populated by the
// worker after deserialize). Reading it here keeps the function
// signature short and avoids the parameter-and-context double
// threading the prior shape forced.
let telemetry = context.telemetry.as_ref();
let mut tool_usage_stats: HashMap<String, usize> = HashMap::new();
#[allow(unused_assignments)]
let mut last_system_message: Option<String> = None;
#[allow(unused_assignments)]
let mut last_request_body: Option<String> = None;
let mut total_input_tokens = 0;
let mut total_output_tokens = 0;
let mut scratchpad_content = if let Some(store) = &context.store {
if let Ok(Some(content)) = store.get(&agent_config.name).await {
info!(agent=%agent_config.name, "Loaded persistent scratchpad from Sovereign Store.");
// Phase 6C: Evaluators only see <key_findings> + <strategy> sections
// to keep evaluation focused and reduce context bloat.
if context.phase == DeliberationPhase::Evaluating {
extract_evaluation_sections(&content)
} else {
content
}
} else {
String::new()
}
} else {
String::new()
};
// Matches default_max_react_iterations() in the agent SDK (20).
// Kept in lockstep so an agent that doesn't configure the field
// explicitly gets the same ceiling in both the sdk-side config
// resolver and this worker-side fallback.
let max_iterations = agent_config
.max_react_iterations
.filter(|&v| v > 0)
.unwrap_or(20) as usize;
let max_scratchpad_size = agent_config
.max_scratchpad_size
.filter(|&v| v > 0)
.unwrap_or(32_768) as usize;
// Define the update_scratchpad tool schema
let scratchpad_tool_schema = ChatCompletionTool {
r#type: ChatCompletionToolType::Function,
function: FunctionObject {
name: "update_scratchpad".to_string(),
description: Some("Update your persistent scratchpad memory.".to_string()),
parameters: Some(json!({
"type": "object",
"properties": {
"content": { "type": "string", "description": "Text to store." },
"mode": { "type": "string", "enum": ["append", "overwrite"] }
},
"required": ["content", "mode"],
"additionalProperties": false
})),
strict: Some(true),
},
};
let compact_history_tool_schema = ChatCompletionTool {
r#type: ChatCompletionToolType::Function,
function: FunctionObject {
name: "compact_history".to_string(),
description: Some(
"Fold older tool-call results in your conversation history into a \
single LLM-summarized synopsis (appended to your scratchpad), \
keeping the most recent N tool calls verbatim. Call this when \
your context utilization climbs into the 75-90% range or when \
your scratchpad is nearly full — frees context budget for \
further reasoning."
.to_string(),
),
parameters: Some(json!({
"type": "object",
"properties": {
"keep_last_n_calls": {
"type": "integer",
"minimum": 1,
"maximum": 10,
"description": "How many of the most recent tool-call results to keep verbatim. Default 2."
}
},
"required": ["keep_last_n_calls"],
"additionalProperties": false
})),
strict: Some(true),
},
};
let mut messages: Vec<ChatCompletionRequestMessage> = if let Some(h) = input_history {
h
} else {
let p = initial_prompt.clone().unwrap_or_default();
if agent_config.merge_system_prompt {
vec![
ChatCompletionRequestUserMessage {
content: p.into(),
..Default::default()
}
.into(),
]
} else {
vec![
ChatCompletionRequestSystemMessage {
content: "".into(), // Placeholder
..Default::default()
}
.into(),
ChatCompletionRequestUserMessage {
content: p.into(),
..Default::default()
}
.into(),
]
}
};
// Capture the original user prompt content to prevent loss or duplication when injecting scratchpad.
// Strip any pre-existing <scratchpad>...</scratchpad> block so retries (input_history) don't
// nest scratchpads when the content is re-merged with a fresh scratchpad_text below.
//
// When merge_system_prompt is true and this is a retry (input_history), the first User message
// already contains "{system_prompt}\n\n<scratchpad>...</scratchpad>\n\n{user_query}".
// We must extract only the user_query part (after </scratchpad>) to avoid duplicating the
// system prompt on the next merge.
let original_user_content = {
let raw = messages
.iter()
.find_map(|m| {
if let ChatCompletionRequestMessage::User(u) = m
&& let async_openai::types::ChatCompletionRequestUserMessageContent::Text(t) =
&u.content
{
return Some(t.clone());
}
None
})
.unwrap_or_default();
if agent_config.merge_system_prompt {
// When merge_system_prompt is active, the scratchpad sits between the system
// prompt and the user content. Extract only what follows </scratchpad>.
if let Some(end_idx) = raw.find("</scratchpad>") {
raw[end_idx + "</scratchpad>".len()..]
.trim_start()
.to_string()
} else {
// No scratchpad found — fresh start or first iteration. strip_scratchpad is
// a no-op anyway, but keeps the fallback safe.
strip_scratchpad(&raw)
}
} else {
strip_scratchpad(&raw)
}
};
let mut tool_schemas: Vec<ChatCompletionTool> = tools.iter().map(|t| t.schema()).collect();
tool_schemas.extend(extra_tool_schemas);
tool_schemas.push(scratchpad_tool_schema);
tool_schemas.push(compact_history_tool_schema);
let tool_map: HashMap<String, &Box<dyn Tool>> = tools.iter().map(|t| (t.name(), t)).collect();
// Time-aware tool stripping: track iteration durations to detect when the
// agent is running out of phase budget and should finalize immediately.
let loop_start = std::time::Instant::now();
let mut iteration_durations: Vec<std::time::Duration> = Vec::new();
for iteration_index in 0..max_iterations {
let iter_start = std::time::Instant::now();
// Regenerate system message (without scratchpad)
let mut system_message = agent_config
.system_prompt_override
.clone()
.unwrap_or_else(|| {
prompt_set.get_system_message(
&agent_config.name,
context.round_number as usize,
context.total_rounds as usize,
context.phase, // Fix E0382: Clone phase because get_system_message takes value
)
});
// Construct scratchpad block to be injected into User message
let scratchpad_text = format!(
"\n\n<scratchpad>\n(This persists across tool calls. Use the `update_scratchpad` tool to store notes.)\n{}\n</scratchpad>",
if scratchpad_content.is_empty() {
"(Empty)"
} else {
&scratchpad_content
}
);
// ── Time-aware tool stripping ──────────────────────────────────
// When remaining phase budget ≤ 3× average iteration time, strip
// all non-terminal tools so the LLM is forced to finalize.
let avg_iteration_secs = if iteration_durations.is_empty() {
0.0
} else {
let sum: f64 = iteration_durations.iter().map(|d| d.as_secs_f64()).sum();
sum / iteration_durations.len() as f64
};
let elapsed = loop_start.elapsed().as_secs_f64();
let remaining_budget = context.phase_budget_remaining_secs - elapsed;
let remaining_iterations = max_iterations.saturating_sub(iteration_index);
let force_finalize_time = context.phase_budget_remaining_secs > 0.0
&& ((remaining_budget <= 0.0)
|| (avg_iteration_secs > 0.0 && remaining_budget <= avg_iteration_secs * 3.0));
let force_finalize_window =
resolve_finalize_window(std::env::var("NSED_FINALIZE_WINDOW").ok().as_deref());
let force_finalize_iters =
max_iterations > force_finalize_window && remaining_iterations <= force_finalize_window;
let force_finalize = force_finalize_time || force_finalize_iters;
let active_tool_schemas = if force_finalize {
warn!(
agent = %agent_config.name,
trigger = if force_finalize_iters { "iter_budget_low" } else { "time_budget_low" },
avg_iter_secs = format!("{:.2}", avg_iteration_secs),
remaining_budget_secs = format!("{:.2}", remaining_budget),
remaining_iterations = remaining_iterations,
"⏱️ Budget low — stripping non-terminal tools to force finalization."
);
// Keep only the terminal tool
if let Some(name) = terminal_tool_name {
tool_schemas
.iter()
.filter(|t| t.function.name == name)
.cloned()
.collect::<Vec<_>>()
} else {
vec![]
}
} else {
tool_schemas.clone()
};
// Build active tool name set for gating execution (prevents stripped
// tools from being called via text-extraction path when force_finalize).
let active_tool_names: std::collections::HashSet<&str> = active_tool_schemas
.iter()
.map(|s| s.function.name.as_str())
.collect();
// If native tools are disabled, we must inject tool definitions manually into the system prompt
// so the model knows they exist and how to call them.
if agent_config.disable_native_tools {
if agent_config.tool_format.as_deref() == Some("nous") {
// Inject Nous XML format (Qwen style)
let mut tool_descs = String::new();
for schema in &active_tool_schemas {
let func_schema = serde_json::json!({
"name": schema.function.name,
"description": schema.function.description,
"parameters": schema.function.parameters
});
tool_descs.push_str(&serde_json::to_string(&func_schema).unwrap_or_default());
tool_descs.push('\n');
}
let tool_text = format!(
"\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{{\"name\": <function-name>, \"arguments\": <args-json-object>}}\n</tool_call>\n",
tool_descs
);
system_message.push_str(&tool_text);
} else {
// Default Text format (Python style)
let mut tool_text = String::from(
"\n\n<tools>\nYou have access to the following tools. To use them, output a function call in square brackets like:\n[tool_name(arg=\"value\")]\n\n",
);
for schema in &active_tool_schemas {
tool_text.push_str(&format!(
"- {}: {}\n",
schema.function.name,
schema.function.description.clone().unwrap_or_default()
));
if let Some(params) = &schema.function.parameters {
let args = serde_json::to_string_pretty(params).unwrap_or_default();
tool_text.push_str(&format!(" Arguments: {args}\n"));
}
}
tool_text.push_str("</tools>\n");
system_message.push_str(&tool_text);
}
}
// Inject user tool awareness only when both tools are defined AND a handler is available
if !context.user_tools.is_empty() && context.user_tool_handler.is_some() {
system_message.push_str(
"\n<external_tools>\n\
Tools prefixed 'user_' are serviced outside this system. Responses may be delayed.\n\
You MUST supply all required parameters defined in the tool schema. \
For messaging tools, always include your question or message in the 'message' parameter — never call with empty arguments.\n\
</external_tools>\n",
);
}
last_system_message = Some(system_message.clone());
// Update the message with system prompt content and scratchpad
if agent_config.merge_system_prompt {
if let Some(ChatCompletionRequestMessage::User(user_msg)) = messages.first_mut() {
let merged =
format!("{system_message}\n\n{scratchpad_text}\n\n{original_user_content}");
user_msg.content =
async_openai::types::ChatCompletionRequestUserMessageContent::Text(merged);
}
} else {
if let Some(ChatCompletionRequestMessage::System(sys_msg)) = messages.first_mut() {
sys_msg.content = system_message.into();
}
// Inject scratchpad into the first User message (which contains the prompt)
for msg in messages.iter_mut() {
if let ChatCompletionRequestMessage::User(user_msg) = msg {
let merged = format!("{scratchpad_text}\n\n{original_user_content}");
user_msg.content =
async_openai::types::ChatCompletionRequestUserMessageContent::Text(merged);
break;
}
}
}
// Runs AFTER the system + scratchpad merge so it measures the
// real prompt size. Includes `tools` because large built-in or
// user tool schemas can be the difference between "under 90%"
// and "shrink-guard floor" — the original messages-only size
// check could miss the exact failure mode this path is meant
// to catch.
let request_tools_for_sizing: Option<&Vec<ChatCompletionTool>> =
if active_tool_schemas.is_empty() || agent_config.disable_native_tools {
None
} else {
Some(&active_tool_schemas)
};
let request_chars = serde_json::to_string(&json!({
"messages": &messages,
"tools": request_tools_for_sizing,
}))
.map(|s| s.len())
.unwrap_or(0);
let messages_chars = request_chars;
let chars_per_token = agent_config
.chars_per_token
.map(|v| v as f32)
.unwrap_or(4.0);
if should_auto_compact(
messages_chars,
agent_config.context_window,
messages.len(),
chars_per_token,
) {
info!(
agent = %agent_config.name,
messages_chars,
context_window = agent_config.context_window,
"auto-invoking compact_history"
);
match compact_message_history(
llm_client,
agent_config,
&messages,
agent_config.compact_history_default_keep,
)
.await
{
Ok(result) if result.compacted_count > 0 => {
// A verbose summary can net-grow the request; this
// path exists only to dodge the shrink-guard, so
// gate the swap on a serialized-size shrink check.
// Use the same `{messages, tools}` envelope as
// `messages_chars` so the comparison is like-for-like.
let new_chars = serde_json::to_string(&json!({
"messages": &result.new_messages,
"tools": request_tools_for_sizing,
}))
.map(|s| s.len())
.unwrap_or(usize::MAX);
if new_chars >= messages_chars {
warn!(
agent = %agent_config.name,
old = messages_chars,
new = new_chars,
"auto-compact did not shrink request; keeping original history"
);
} else {
messages = result.new_messages;
// Stage the appended summary so a too-large
// candidate the squeezer can't shrink never
// gets persisted and re-bloats next prompt.
let mut candidate = scratchpad_content.clone();
if !candidate.is_empty() {
candidate.push('\n');
}
candidate.push_str("[compacted_history (auto)]\n");
candidate.push_str(&result.summary);
let final_scratchpad = match squeeze_scratchpad_if_full(
llm_client,
agent_config,
&candidate,
max_scratchpad_size,
)
.await
{
Ok(Some(squeezed)) => Some(squeezed),
Ok(None) if candidate.len() <= max_scratchpad_size => Some(candidate),
Ok(None) => {
warn!(
agent = %agent_config.name,
candidate_len = candidate.len(),
max_scratchpad_size,
"auto-squeeze produced no shrink and candidate \
still over cap; keeping original scratchpad"
);
None
}
Err(e) => {
warn!(error=%e, "auto-squeeze failed; keeping scratchpad");
None
}
};
if let Some(committed) = final_scratchpad {
scratchpad_content = committed;
// In Evaluating phase the in-memory scratchpad
// is a subset of the canonical store; writing
// back would wipe non-evaluation sections.
if context.phase != DeliberationPhase::Evaluating
&& let Some(store) = &context.store
&& let Err(e) =
store.set(&agent_config.name, &scratchpad_content).await
{
warn!(error=%e, "Failed to persist auto-compacted scratchpad");
}
}
}
}
Ok(_) => {}
Err(e) => warn!(error=%e, "auto-compact failed; SDK shrink-guard takes over"),
}
}
// Create our "partial" request config, which the AiModel implementation will complete.
let request_config = RequestConfig {
messages: messages.clone(),
tools: if active_tool_schemas.is_empty() || agent_config.disable_native_tools {
None
} else {
Some(active_tool_schemas.clone())
},
tool_choice: None,
presence_penalty: if context.phase == DeliberationPhase::Evaluating {
agent_config.presence_penalty.map(|p| p / 2.0)
} else {
agent_config.presence_penalty
},
};
// Estimate input tokens for telemetry
let messages_json = serde_json::to_string(&request_config.messages).unwrap_or_default();
let estimated_input_tokens = (messages_json.len() as f32 / 3.0) as u32;
let request_id = uuid::Uuid::new_v4().to_string();
let ctx = context.telemetry_for();
// `estimated_input_tokens` is a pre-shrink chars/3 heuristic
// and can compute > 100%; clamp so dashboards stay honest.
let context_utilization_pct = if agent_config.context_window > 0 {
((estimated_input_tokens as f64 / agent_config.context_window as f64) * 100.0)
.clamp(0.0, 100.0)
} else {
0.0
};
if context_utilization_pct >= 90.0 {
warn!(
agent = %agent_config.name,
pct = context_utilization_pct,
estimated_input_tokens,
context_window = agent_config.context_window,
"context utilization ≥90% — shrink-guard imminent"
);
} else if context_utilization_pct >= 75.0 {
warn!(
agent = %agent_config.name,
pct = context_utilization_pct,
estimated_input_tokens,
context_window = agent_config.context_window,
"context utilization ≥75%"
);
} else if context_utilization_pct >= 50.0 {
warn!(
agent = %agent_config.name,
pct = context_utilization_pct,
estimated_input_tokens,
context_window = agent_config.context_window,
"context utilization ≥50%"
);
}
let mut span = LlmRequestSpan::start(
telemetry,
&ctx,
&request_id,
outer_attempt,
&agent_config.model_name,
&agent_config.provider_id,
estimated_input_tokens,
context_utilization_pct,
*running_tool_output_bytes,
);
let result = match llm_client
.chat_completion(agent_config, request_config)
.await
{
Ok(res) => res,
Err(e) => {
span.fail(&e).await;
// Dump failure details to file for debugging API errors (opt-in via NSED_FAILURE_DUMPS=1|full)
let phase_str = format!("{:?}", context.phase);
let dump_file = write_failure_dump(FailureDumpParams {
kind: "api_error",
agent_name: &agent_config.name,
model_name: &agent_config.model_name,
provider_id: &agent_config.provider_id,
error: &format!("{e:#}"),
session_id: context.session_id.as_deref(),
phase: Some(&phase_str),
round: Some(context.round_number),
attempt: None,
finish_reason: None,
input_tokens: None,
output_tokens: None,
response_content: None,
system_prompt: last_system_message.as_deref(),
request_body: None,
messages: Some(&messages),
failure_dumps_config: agent_config.failure_dumps.as_deref(),
});
if let Some(ref path) = dump_file {
warn!(
agent_name = %agent_config.name,
error = %e,
dump_file = %path,
"API request failed. Dump saved to file."
);
} else {
warn!(
agent_name = %agent_config.name,
error = %e,
"API request failed. Set NSED_FAILURE_DUMPS=1 to save debug dumps."
);
}
// Preserve the typed `LlmError` through the anyhow
// wrap (`From<LlmError> for anyhow::Error` keeps the
// concrete type via `Error::source` / downcast). The
// retry classifier in `generate_structured_output`
// pattern-matches via `downcast_ref::<LlmError>()`
// rather than scraping the formatted string.
return Err(e.into());
}
};
// Complete LLM request span with telemetry
let cost_usd = {
let usage = result.response.usage.clone();
let (input_tokens, output_tokens, reasoning_tokens, cached_tokens) =
if let Some(u) = &usage {
let cached = u
.prompt_tokens_details
.as_ref()
.and_then(|d| d.cached_tokens)
.unwrap_or(0);
let reasoning = u
.completion_tokens_details
.as_ref()
.and_then(|d| d.reasoning_tokens)
.unwrap_or(0);
(u.prompt_tokens, u.completion_tokens, reasoning, cached)
} else {
let content_len = result
.response
.choices
.first()
.and_then(|c| c.message.content.as_deref())
.unwrap_or("")
.len();
let cpt = agent_config.chars_per_token.unwrap_or(4.0).max(0.1);
let out_est = (content_len as f64 / cpt).ceil() as u32;
(estimated_input_tokens, out_est, 0, 0)
};
estimate_llm_cost_usd(
&agent_config.model_name,
input_tokens,
output_tokens,
reasoning_tokens,
cached_tokens,
)
};
// `messages_chars` is the byte-len of the same JSON we already
// serialized for `estimated_input_tokens`. `max_tokens_requested`
// is the agent's configured cap (strategy may override
// internally for vLLM context-shrink, but the requested value
// is what tools the operator's diagnostic view).
let messages_chars = messages_json.len() as u32;
let max_tokens_requested = if agent_config.max_tokens > 0 {
Some(agent_config.max_tokens as u32)
} else {
None
};
span.complete(&result, cost_usd, messages_chars, max_tokens_requested)
.await;
let response = result.response;
let request_body = result.raw_request;
last_request_body = Some(request_body);
let choice = response
.choices
.first()
.context("No choice in LLM response")?;
let finish_reason = choice.finish_reason.map(|r| format!("{r:?}"));
if let Some(usage) = &response.usage {
total_input_tokens += usage.prompt_tokens;
total_output_tokens += usage.completion_tokens;
} else {
// Fallback: estimate tokens using chars_per_token when provider doesn't return usage
let cpt = agent_config.chars_per_token.unwrap_or(4.0).max(0.1);
let input_len = serde_json::to_string(&messages).unwrap_or_default().len();
let output_len = choice.message.content.as_deref().unwrap_or("").len();
total_input_tokens += (input_len as f64 / cpt).ceil() as u32;
total_output_tokens += (output_len as f64 / cpt).ceil() as u32;
}
// Clone the message so we can modify (repair) it before pushing to history.
let mut response_message = choice.message.clone();
// REPAIR TRUNCATED JSON IN TOOL ARGUMENTS
// This prevents "400 Bad Request" errors in the NEXT round if we send back
// a history containing broken JSON strings in tool_calls.
if agent_config.repair_invalid_escapes {
repair_tool_calls(&mut response_message, &agent_config.name);
}
// Keep the original content for heuristic tool scanning (some models put tool calls inside thinking blocks)
let full_content = response_message.content.clone().unwrap_or_default();
let mut content = full_content.clone();
// Check for native thinking blocks
if agent_config.supports_native_thinking {
while let Some(start) = content.find("<think>") {
if let Some(end_offset) = content[start..].find("</think>") {
let end = start + end_offset;
let thought = &content[start + 7..end];
info!(
target: "nsed_activity",
event = "native_thinking",
agent = %agent_config.name,
content = %thought
);
// Strip the thinking block to save context space in history
content.replace_range(start..end + 8, "");
} else {
break;
}
}
}
messages.push(
#[allow(deprecated)]
ChatCompletionRequestAssistantMessage {
content: Some(
async_openai::types::ChatCompletionRequestAssistantMessageContent::Text(
content.clone(),
),
),
tool_calls: if agent_config.disable_native_tools {
None
} else {
response_message.tool_calls.clone()
},
function_call: None,
refusal: None,
name: None,
audio: None,
}
.into(),
);
// Workaround: Some providers (vLLM) return empty tool_calls vec instead of None.
// We treat empty tool_calls as no tool calls (final answer).
let mut tool_calls_list = response_message.tool_calls.clone();
let mut has_tool_calls = tool_calls_list
.as_ref()
.map(|t| !t.is_empty())
.unwrap_or(false);
// If no native tool calls, try to extract tool calls from content
if !has_tool_calls {
let extracted = if agent_config.tool_format.as_deref() == Some("nous") {
extract_xml_tool_calls(&full_content)
} else {
extract_python_tool_calls(&full_content)
};
if !extracted.is_empty() {
// Patch the last message in history to include these tool calls so vLLM accepts the subsequent Tool messages
// BUT only if native tools are enabled. If disabled, we keep history as text-only.
if !agent_config.disable_native_tools
&& let Some(ChatCompletionRequestMessage::Assistant(last_msg)) =
messages.last_mut()
{
last_msg.tool_calls = Some(extracted.clone());
}
tool_calls_list = Some(extracted);
has_tool_calls = true;
info!("Extracted tool calls from text content.");
} else {
// Fallback: Check for implicit JSON tool calls (for models that ignore prompt instructions)
let json_calls = llm_repair::extraction::heuristic_json_tool_calls(&full_content);
if !json_calls.is_empty() {
tool_calls_list = Some(json_calls);
has_tool_calls = true;
info!("Extracted implicit JSON tool calls from content.");
}
}
}
if has_tool_calls {
let tool_calls = tool_calls_list.as_ref().unwrap();
let mut tool_outputs_text = Vec::new();
for tool_call in tool_calls {
let tool_name = &tool_call.function.name;
info!(tool_name = %tool_name, "LLM requested a tool call.");
debug!(tool_arguments = %tool_call.function.arguments, "Tool call arguments.");
info!(
target: "nsed_activity",
event = "tool_call",
agent = %agent_config.name,
tool = %tool_name,
arguments = %tool_call.function.arguments
);
// Check for terminal tool
if terminal_tool_name == Some(tool_name) {
// Guard: if the response was truncated (finish_reason: Length),
// the terminal tool's content is likely clipped. Reject and retry
// with escalated max_tokens instead of returning truncated output.
let was_truncated = finish_reason.as_deref() == Some("Length");
if was_truncated {
warn!(
agent_name = %agent_config.name,
tool = %tool_name,
"Terminal tool call truncated by max_tokens. Rejecting and retrying with more tokens."
);
// Escalate max_tokens for next iteration
let old = agent_config.max_tokens;
let ceiling = if agent_config.context_window > 0 {
(agent_config.context_window as f64 * 0.5) as i32
} else {
32_768
};
let new_tokens = ((old as f64 * 1.5).ceil() as i32).min(ceiling);
if new_tokens > old {
agent_config.max_tokens = new_tokens;
warn!(
agent_name = %agent_config.name,
old_max_tokens = old,
new_max_tokens = new_tokens,
"Escalating max_tokens in react loop after terminal tool truncation."
);
}
// Feed back truncation error so the model retries the submission
let error_text = format!(
"ERROR: Your `{tool_name}` call was truncated (output cut off mid-response). \
Your submission was NOT accepted. You now have more output space. \
Please call `{tool_name}` again with your COMPLETE response. Do NOT shorten or summarize it."
);
if agent_config.disable_native_tools {
// Non-native: use a User message
messages.push(
ChatCompletionRequestMessage::User(ChatCompletionRequestUserMessage {
content:
async_openai::types::ChatCompletionRequestUserMessageContent::Text(
error_text,
),
name: None,
}),
);
} else {
// Native tools: respond with a Tool message to satisfy API protocol
messages.push(
ChatCompletionRequestToolMessage {
tool_call_id: tool_call.id.clone(),
content: ChatCompletionRequestToolMessageContent::Text(
error_text,
),
}
.into(),
);
}
break; // break out of tool_calls loop, continue react loop
}
info!(
"LLM called terminal tool '{}'. Returning arguments.",
tool_name
);
return Ok(AgentResponse {
content: tool_call.function.arguments.clone(),
tool_usage: tool_usage_stats,
finish_reason: finish_reason.or(response_message.refusal.clone()),
input_tokens: Some(total_input_tokens),
output_tokens: Some(total_output_tokens),
system_prompt: last_system_message.clone(),
request_body: last_request_body.clone(),
history: messages.clone(),
final_scratchpad: if scratchpad_content.is_empty() {
None
} else {
Some(scratchpad_content.clone())
},
});
}
// Increment tool usage stats
*tool_usage_stats.entry(tool_name.clone()).or_insert(0) += 1;
let tool_exec_start = std::time::Instant::now();
// `tool_success` is set explicitly in every branch
// alongside `tool_output` so the telemetry emission
// below records the real outcome rather than inferring
// it from the output string's prefix (which would
// miss errors that happen to format without "Error:"
// and would misclassify legitimate outputs that
// happen to start with one). The initial value is
// overwritten in every code path; the explicit
// `false` is a
// compile-time guarantee against an accidental fall-
// through emitting `success: true`.
#[allow(unused_assignments)]
let mut tool_success: bool = false;
let mut tool_output = if tool_name == "update_scratchpad"
&& active_tool_names.contains("update_scratchpad")
{
match serde_json::from_str::<Value>(&tool_call.function.arguments) {
Ok(args) => {
let content = args["content"].as_str().unwrap_or("").to_string();
let mode = args["mode"].as_str().unwrap_or("append");
let projected_len =
if mode == "overwrite" || scratchpad_content.is_empty() {
content.len()
} else {
scratchpad_content.len() + 1 + content.len()
};
if projected_len > max_scratchpad_size {
tool_success = false;
format!(
"Error: Scratchpad update would exceed limit of {} chars. Current: {}. Requested: {}.",
max_scratchpad_size,
scratchpad_content.len(),
projected_len
)
} else {
if mode == "overwrite" {
scratchpad_content = content;
} else {
if !scratchpad_content.is_empty() {
scratchpad_content.push('\n');
}
scratchpad_content.push_str(&content);
}
if let Some(store) = &context.store
&& let Err(e) =
store.set(&agent_config.name, &scratchpad_content).await
{
warn!(error=%e, "Failed to persist scratchpad to stream store.");
}
tool_success = true;
format!(
"Scratchpad updated. Current size: {} chars.",
scratchpad_content.len()
)
}
}
Err(e) => {
tool_success = false;
format!("Error parsing arguments: {e}")
}
}
} else if tool_name == "compact_history"
&& active_tool_names.contains("compact_history")
{
let keep = serde_json::from_str::<Value>(&tool_call.function.arguments)
.ok()
.and_then(|v| v["keep_last_n_calls"].as_u64())
.map(|n| n.clamp(1, 10) as usize)
.unwrap_or(agent_config.compact_history_default_keep.max(1));
match compact_message_history(llm_client, agent_config, &messages, keep).await {
Ok(result) if result.compacted_count == 0 => {
tool_success = true;
"compact_history: nothing to compact (history already short)."
.to_string()
}
Ok(result) => {
messages = result.new_messages;
// Stage the appended summary so a too-large
// candidate that the squeezer can't shrink
// never gets persisted and re-bloats next prompt.
let mut candidate = scratchpad_content.clone();
if !candidate.is_empty() {
candidate.push('\n');
}
candidate.push_str("[compacted_history]\n");
candidate.push_str(&result.summary);
let final_scratchpad = match squeeze_scratchpad_if_full(
llm_client,
agent_config,
&candidate,
max_scratchpad_size,
)
.await
{
Ok(Some(squeezed)) => {
info!(
agent = %agent_config.name,
old_len = candidate.len(),
new_len = squeezed.len(),
"scratchpad auto-squeezed during compact_history"
);
Some(squeezed)
}
Ok(None) => {
if max_scratchpad_size == 0
|| candidate.len() <= max_scratchpad_size
{
Some(candidate)
} else {
warn!(
agent = %agent_config.name,
candidate_len = candidate.len(),
max = max_scratchpad_size,
"compact_history candidate exceeds scratchpad cap and \
squeeze couldn't recover; keeping pre-compaction \
scratchpad"
);
None
}
}
Err(e) => {
warn!(error=%e, "scratchpad squeeze failed; keeping pre-compaction scratchpad");
None
}
};
if let Some(committed) = final_scratchpad {
scratchpad_content = committed;
// Same Eval-phase guard as the auto-invoke
// path above.
if context.phase != DeliberationPhase::Evaluating
&& let Some(store) = &context.store
&& let Err(e) =
store.set(&agent_config.name, &scratchpad_content).await
{
warn!(error=%e, "Failed to persist scratchpad after compact_history.");
}
}
tool_success = true;
format!(
"Compacted {} earlier tool calls into scratchpad. \
Scratchpad now {} chars. Continue reasoning with \
the compacted history.",
result.compacted_count,
scratchpad_content.len()
)
}
Err(e) => {
warn!(error=%e, "compact_history call failed");
tool_success = false;
format!("compact_history failed: {e}")
}
}
} else if !active_tool_names.contains(tool_name.as_str()) {
// Tool was stripped (e.g., force_finalize); reject the call
warn!(tool_name = %tool_name, "LLM called a stripped tool (not in active set).");
tool_success = false;
format!("Error: Tool `{tool_name}` is not available in the current phase.")
} else if let Some(tool) = tool_map.get(tool_name) {
let arg_str = &tool_call.function.arguments;
let args_result = if arg_str.trim().is_empty() {
Ok(serde_json::json!({}))
} else {
serde_json::from_str::<Value>(arg_str).map_err(|e| e.to_string())
};
match args_result {
Ok(args) => match tool.call(args).await {
Ok(result) => {
debug!(tool_name = %tool_name, output_length = result.len(), "Tool executed successfully.");
tool_success = true;
result
}
Err(e) => {
warn!(tool_name = %tool_name, error = %e, "Tool execution failed.");
tool_success = false;
format!("Error calling tool `{tool_name}`: {e}")
}
},
Err(e) => {
let msg = format!(
"Error parsing arguments for tool '{}': {}. Raw arguments: '{}'",
tool_name, e, arg_str
);
warn!(tool_name=%tool_name, error=%e, raw_args=%arg_str, "Failed to parse tool arguments.");
tool_success = false;
msg
}
}
} else {
warn!(tool_name = %tool_name, "LLM called a tool that does not exist.");
tool_success = false;
format!("Error: Tool `{tool_name}` not found.")
};
let truncated = apply_tool_output_cap(
&mut tool_output,
agent_config.context_window,
estimated_input_tokens,
tool_name,
);
let latency_ms = tool_exec_start.elapsed().as_millis() as u64;
let output_bytes = tool_output.len() as u64;
// 4 chars/token rule-of-thumb; ceiling so non-empty
// outputs under 4 bytes still report 1 token.
let output_tokens_estimated =
Some((output_bytes.div_ceil(4)).min(u32::MAX as u64) as u32);
*running_tool_output_bytes = running_tool_output_bytes.saturating_add(output_bytes);
emit_for!(
context,
ToolCallExecuted {
tool_name: tool_name.clone(),
latency_ms,
success: tool_success,
output_bytes,
output_tokens_estimated,
truncated,
paginated: false,
}
);
info!(
target: "nsed_activity",
event = "tool_output",
agent = %agent_config.name,
tool = %tool_name,
output = %tool_output
);
if agent_config.disable_native_tools {
tool_outputs_text.push(format!("Tool Output ({tool_name}): {tool_output}"));
} else {
messages.push(
ChatCompletionRequestToolMessage {
tool_call_id: tool_call.id.clone(),
content: ChatCompletionRequestToolMessageContent::Text(tool_output),
}
.into(),
);
}
}
if agent_config.disable_native_tools && !tool_outputs_text.is_empty() {
// Consolidate all tool outputs into a single User message to satisfy strict chat templates
let combined_output = tool_outputs_text.join("\n\n");
messages.push(
ChatCompletionRequestUserMessage {
content: async_openai::types::ChatCompletionRequestUserMessageContent::Text(
combined_output,
),
..Default::default()
}
.into(),
);
}
} else {
// No tool calls, this is the final answer.
info!("LLM provided a final answer without a tool call.");
return Ok(AgentResponse {
content,
tool_usage: tool_usage_stats,
finish_reason,
input_tokens: Some(total_input_tokens),
output_tokens: Some(total_output_tokens),
system_prompt: last_system_message,
request_body: last_request_body,
history: messages.clone(),
final_scratchpad: if scratchpad_content.is_empty() {
None
} else {
Some(scratchpad_content.clone())
},
});
}
// Record this iteration's duration for the time-aware tool stripping heuristic.
iteration_durations.push(iter_start.elapsed());
}
// Max-iterations exhaustion: instead of bubbling an error that
// blocks the orchestrator waiting for this agent's proposal /
// evaluation, synthesize an empty-but-schema-valid terminal tool
// response so the worker publishes SOMETHING on the agent's phase
// subject. The orchestrator then aggregates it with the rest and
// the deliberation proceeds. Erroring here would have left peers
// blocked on the per-phase SLA floor before the missing agent got
// an implicit max-score injection — expensive and visible.
//
// The synthetic content mirrors the expected terminal tool's JSON
// shape so the caller's `serde_json::from_str::<T>(...)` succeeds
// on first try (no retry loop burning budget with a known-stuck
// agent):
// - submit_proposal: empty solution_content, explanation
// in thought_process so downstream
// scoring sees the agent "voted
// empty" rather than crashed.
// - submit_batch_evaluation: evaluations: [] — no votes cast.
// - unknown terminal tool: {} (fall back to generic object;
// caller may still fail to deserialize,
// but at least we don't crash here).
warn!(
agent = %agent_config.name,
terminal_tool = ?terminal_tool_name,
"Agent reached maximum iterations without a final answer — emitting empty synthetic terminal tool response so peers are not blocked."
);
let synthetic_content = empty_terminal_tool_content(terminal_tool_name);
Ok(AgentResponse {
content: synthetic_content,
tool_usage: tool_usage_stats,
finish_reason: Some("max_iterations".to_string()),
input_tokens: Some(total_input_tokens),
output_tokens: Some(total_output_tokens),
system_prompt: last_system_message,
request_body: last_request_body,
history: messages.clone(),
final_scratchpad: if scratchpad_content.is_empty() {
None
} else {
Some(scratchpad_content.clone())
},
})
}
/// Default size of the "terminate or die" tail window — the last N
/// iterations of `react_loop` have non-terminal tools stripped so the
/// model's only legal move is to call the terminal tool. N retries absorb
/// transient malformed terminal calls (truncation, bad JSON).
/// Override per-workload via `NSED_FINALIZE_WINDOW`; the default of 3
/// balances empirical LLM tool-call error rates (~5–20% per attempt)
/// against wasted free-thinking budget.
const DEFAULT_FINALIZE_WINDOW: usize = 3;

/// Resolves the finalize-window size from an optional raw override string.
///
/// Pure-input form (consumes `Option<&str>` instead of reading the env
/// directly) so the resolution can be unit-tested without env-var
/// race hazards.
///
/// Returns the parsed value when `raw` holds a positive integer;
/// surrounding whitespace is ignored so a padded or newline-terminated
/// value is still honoured. Returns [`DEFAULT_FINALIZE_WINDOW`] when `raw`
/// is `None`, does not parse as `usize`, or parses to zero.
fn resolve_finalize_window(raw: Option<&str>) -> usize {
    raw.map(str::trim)
        // `str::parse::<usize>` treats any whitespace as an error, hence the trim above.
        .and_then(|s| s.parse::<usize>().ok())
        // Zero is treated like a malformed value rather than a valid window size.
        .filter(|&n| n > 0)
        .unwrap_or(DEFAULT_FINALIZE_WINDOW)
}
#[cfg(test)]
mod tests {
use super::{
DEFAULT_FINALIZE_WINDOW, FailureDumpParams, StructuredBatchEvaluationResponse,
StructuredProposalResponse, apply_tool_output_cap, empty_terminal_tool_content,
extract_evaluation_sections, resolve_finalize_window, strip_scratchpad,
strip_thinking_prefix, strip_working_memory, write_failure_dump,
};
use serial_test::serial;
/// With no override supplied, the resolver yields the built-in default,
/// which is pinned to 3.
#[test]
fn finalize_window_defaults_when_env_unset() {
    let resolved = resolve_finalize_window(None);
    assert_eq!(resolved, DEFAULT_FINALIZE_WINDOW);
    assert_eq!(resolved, 3);
}
/// Positive integer strings are returned verbatim as the window size.
#[test]
fn finalize_window_parses_valid_value() {
    let cases: [(&str, usize); 3] = [("5", 5), ("1", 1), ("10", 10)];
    for (raw, expected) in cases {
        assert_eq!(resolve_finalize_window(Some(raw)), expected);
    }
}
/// Non-numeric, empty, fractional and negative inputs all resolve to the
/// default window.
#[test]
fn finalize_window_falls_back_on_malformed_value() {
    for malformed in ["abc", "", "3.5", "-1"] {
        assert_eq!(
            resolve_finalize_window(Some(malformed)),
            DEFAULT_FINALIZE_WINDOW
        );
    }
}
/// A literal `0` must not be accepted as a window size.
#[test]
fn finalize_window_rejects_zero() {
    // A zero window would disable the tail-window guard entirely: the
    // `max_iterations > N && remaining_iterations <= N` check collapses
    // to "never trigger". Treating it as malformed means a stray `=0`
    // env value cannot silently switch the safety net off.
    let resolved = resolve_finalize_window(Some("0"));
    assert_eq!(resolved, DEFAULT_FINALIZE_WINDOW);
}
use super::{
ChatCompletionRequestAssistantMessage, ChatCompletionRequestMessage,
ChatCompletionRequestSystemMessage, ChatCompletionRequestToolMessage,
ChatCompletionRequestToolMessageContent, ChatCompletionRequestUserMessage,
ChatCompletionToolType, compact_message_history, should_auto_compact,
squeeze_scratchpad_if_full,
};
use crate::agents::config::AgentConfig;
use crate::llms::{AiModel, ChatCompletionResult, RequestConfig, TimingMetadata};
use crate::telemetry::LlmError;
use async_openai::types::{
ChatChoice, ChatCompletionMessageToolCall, ChatCompletionResponseMessage, CompletionUsage,
CreateChatCompletionResponse, FinishReason as OAFinishReason, FunctionCall, Role,
};
use async_trait::async_trait;
/// Test double for `AiModel` that answers every request with the fixed
/// `text` it was constructed with, ignoring both the agent and the request.
#[derive(Clone, Debug)]
struct CannedSummaryModel {
    text: String,
}

#[async_trait]
impl AiModel for CannedSummaryModel {
    /// Builds a single-choice completion whose assistant content is a clone
    /// of `self.text`, with zeroed usage counters and no timing data.
    async fn chat_completion(
        &self,
        _agent: &AgentConfig,
        _request_config: RequestConfig,
    ) -> Result<ChatCompletionResult, LlmError> {
        let reply = ChatCompletionResponseMessage {
            role: Role::Assistant,
            content: Some(self.text.clone()),
            tool_calls: None,
            #[allow(deprecated)]
            function_call: None,
            refusal: None,
            audio: None,
        };
        let only_choice = ChatChoice {
            index: 0,
            message: reply,
            finish_reason: Some(OAFinishReason::Stop),
            logprobs: None,
        };
        let zero_usage = CompletionUsage {
            prompt_tokens: 0,
            completion_tokens: 0,
            total_tokens: 0,
            prompt_tokens_details: None,
            completion_tokens_details: None,
        };
        let response = CreateChatCompletionResponse {
            id: "canned".into(),
            object: "chat.completion".into(),
            created: 0,
            model: "canned".into(),
            choices: vec![only_choice],
            usage: Some(zero_usage),
            service_tier: None,
            system_fingerprint: None,
        };
        let timing = TimingMetadata {
            ttft_ms: None,
            generation_ms: None,
        };
        Ok(ChatCompletionResult {
            response,
            raw_request: String::new(),
            timing,
            provider_backend: None,
            shrink_info: None,
        })
    }
}
/// Test double for `AiModel` that replies with the text of the most recent
/// text-content user message in the request (or an empty string when there
/// is none), so tests can assert that specific text actually reached the
/// summariser prompt.
#[derive(Clone, Debug, Default)]
struct EchoSummaryModel;

#[async_trait]
impl AiModel for EchoSummaryModel {
    /// Scans the request messages from newest to oldest for a user message
    /// with plain-text content and returns that text as the assistant reply.
    async fn chat_completion(
        &self,
        _agent: &AgentConfig,
        request_config: RequestConfig,
    ) -> Result<ChatCompletionResult, LlmError> {
        let last_user_text = request_config
            .messages
            .iter()
            .rev()
            .find_map(|msg| match msg {
                ChatCompletionRequestMessage::User(user) => match &user.content {
                    async_openai::types::ChatCompletionRequestUserMessageContent::Text(body) => {
                        Some(body.clone())
                    }
                    // Non-text user content: keep scanning older messages.
                    _ => None,
                },
                _ => None,
            })
            .unwrap_or_default();
        let reply = ChatCompletionResponseMessage {
            role: Role::Assistant,
            content: Some(last_user_text),
            tool_calls: None,
            #[allow(deprecated)]
            function_call: None,
            refusal: None,
            audio: None,
        };
        let only_choice = ChatChoice {
            index: 0,
            message: reply,
            finish_reason: Some(OAFinishReason::Stop),
            logprobs: None,
        };
        let zero_usage = CompletionUsage {
            prompt_tokens: 0,
            completion_tokens: 0,
            total_tokens: 0,
            prompt_tokens_details: None,
            completion_tokens_details: None,
        };
        let response = CreateChatCompletionResponse {
            id: "echo".into(),
            object: "chat.completion".into(),
            created: 0,
            model: "echo".into(),
            choices: vec![only_choice],
            usage: Some(zero_usage),
            service_tier: None,
            system_fingerprint: None,
        };
        let timing = TimingMetadata {
            ttft_ms: None,
            generation_ms: None,
        };
        Ok(ChatCompletionResult {
            response,
            raw_request: String::new(),
            timing,
            provider_backend: None,
            shrink_info: None,
        })
    }
}
fn agent_for_compaction() -> AgentConfig {
AgentConfig {
name: "compactor".into(),
..Default::default()
}
}
/// Builds a native tool-call exchange: an assistant message carrying one
/// function tool call (`tool_name`, arguments `{}`) followed by the Tool
/// message answering `call_id` with `result`.
fn synth_tool_pair(
    call_id: &str,
    tool_name: &str,
    result: &str,
) -> Vec<ChatCompletionRequestMessage> {
    let call = ChatCompletionMessageToolCall {
        id: call_id.into(),
        r#type: ChatCompletionToolType::Function,
        function: FunctionCall {
            name: tool_name.into(),
            arguments: "{}".into(),
        },
    };
    let request: ChatCompletionRequestMessage = ChatCompletionRequestAssistantMessage {
        tool_calls: Some(vec![call]),
        ..Default::default()
    }
    .into();
    let reply: ChatCompletionRequestMessage = ChatCompletionRequestToolMessage {
        tool_call_id: call_id.into(),
        content: ChatCompletionRequestToolMessageContent::Text(result.into()),
    }
    .into();
    vec![request, reply]
}
/// With two tool calls in history and `keep = 2`, compaction reports
/// nothing folded, an empty summary, and an unchanged message list.
#[tokio::test]
async fn compact_history_noop_when_below_keep_threshold() {
    let agent = agent_for_compaction();
    let model = CannedSummaryModel {
        text: "should not run".into(),
    };
    let opening: ChatCompletionRequestMessage = ChatCompletionRequestUserMessage {
        content: "go".into(),
        ..Default::default()
    }
    .into();
    let mut messages = vec![opening];
    for (call_id, body) in [("c1", "FILE A CONTENT"), ("c2", "FILE B CONTENT")] {
        messages.extend(synth_tool_pair(call_id, "read_file", body));
    }
    let snapshot = messages.clone();
    let outcome = compact_message_history(&model, &agent, &messages, 2)
        .await
        .unwrap();
    assert_eq!(outcome.compacted_count, 0);
    assert_eq!(outcome.summary, "");
    assert_eq!(outcome.new_messages, snapshot);
}
/// Four native tool calls with `keep = 2`: two are folded, the canned
/// summary text is surfaced, the history shrinks, and the rewritten
/// history contains an assistant tool call named `compact_history`.
#[tokio::test]
async fn compact_history_folds_older_tool_calls_into_summary() {
    let agent = agent_for_compaction();
    let model = CannedSummaryModel {
        text: "Read /linux/scripts/checkpatch.pl lines 1-200; running hypothesis: \
               `--strict` enabled; cited verbatim: ERROR(\"missing newline\")"
            .into(),
    };
    let system: ChatCompletionRequestMessage = ChatCompletionRequestSystemMessage {
        content: "system".into(),
        ..Default::default()
    }
    .into();
    let user: ChatCompletionRequestMessage = ChatCompletionRequestUserMessage {
        content: "go".into(),
        ..Default::default()
    }
    .into();
    let mut messages = vec![system, user];
    let exchanges = [
        ("c1", "read_file", "FILE A — 60K of source"),
        ("c2", "grep_search", "8K of matches"),
        ("c3", "pdf_query", "5K of pdf hits"),
        ("c4", "read_file", "FILE D — 12K"),
    ];
    for (call_id, tool, body) in exchanges {
        messages.extend(synth_tool_pair(call_id, tool, body));
    }
    let len_before = messages.len();
    let outcome = compact_message_history(&model, &agent, &messages, 2)
        .await
        .unwrap();
    assert_eq!(
        outcome.compacted_count, 2,
        "4 tool calls minus keep=2 = 2 folded"
    );
    assert!(
        outcome.summary.contains("checkpatch.pl"),
        "summary text round-trips"
    );
    assert!(
        outcome.new_messages.len() < len_before,
        "compacted history must be shorter than input ({} → {})",
        len_before,
        outcome.new_messages.len()
    );
    // Look through every assistant message's tool calls for the synthetic
    // `compact_history` entry.
    let has_compact_call = outcome
        .new_messages
        .iter()
        .filter_map(|msg| match msg {
            ChatCompletionRequestMessage::Assistant(assistant) => assistant.tool_calls.as_ref(),
            _ => None,
        })
        .flatten()
        .any(|call| call.function.name == "compact_history");
    assert!(has_compact_call);
}
/// Builds a text-mode tool exchange: a plain assistant message reading
/// `calling {tool_name}` followed by a user message of the form
/// `Tool Output ({call_id} {tool_name}): {result}`.
fn synth_user_tool_output(
    call_id: &str,
    tool_name: &str,
    result: &str,
) -> Vec<ChatCompletionRequestMessage> {
    let announcement = async_openai::types::ChatCompletionRequestAssistantMessageContent::Text(
        format!("calling {tool_name}"),
    );
    let assistant: ChatCompletionRequestMessage = ChatCompletionRequestAssistantMessage {
        content: Some(announcement),
        ..Default::default()
    }
    .into();
    let tool_output: ChatCompletionRequestMessage = ChatCompletionRequestUserMessage {
        content: format!("Tool Output ({call_id} {tool_name}): {result}").into(),
        ..Default::default()
    }
    .into();
    vec![assistant, tool_output]
}
/// In text mode (`disable_native_tools`), tool results that live in user
/// messages must be part of what the summariser is asked to compress.
#[tokio::test]
async fn compact_history_text_mode_includes_user_tool_output_in_summary() {
    // The echo model returns its own prompt, so the summary reveals
    // whether the user-rewritten tool result reached the summariser.
    let model = EchoSummaryModel;
    let agent = AgentConfig {
        disable_native_tools: true,
        ..agent_for_compaction()
    };
    let system: ChatCompletionRequestMessage = ChatCompletionRequestSystemMessage {
        content: "system".into(),
        ..Default::default()
    }
    .into();
    let user: ChatCompletionRequestMessage = ChatCompletionRequestUserMessage {
        content: "go".into(),
        ..Default::default()
    }
    .into();
    let mut messages = vec![system, user];
    let exchanges = [
        ("c1", "read_file", "FILE_A_BODY"),
        ("c2", "grep_search", "GREP_HITS"),
        ("c3", "pdf_query", "PDF_HITS"),
    ];
    for (call_id, tool, body) in exchanges {
        messages.extend(synth_user_tool_output(call_id, tool, body));
    }
    let outcome = compact_message_history(&model, &agent, &messages, 1)
        .await
        .unwrap();
    let echoed = outcome.summary.clone();
    let saw_tool_body = ["FILE_A_BODY", "GREP_HITS"]
        .into_iter()
        .any(|needle| echoed.contains(needle));
    assert!(
        saw_tool_body,
        "summariser must see at least one User(Tool Output) body, got: {echoed}"
    );
    assert!(outcome.compacted_count >= 1);
}
/// In text mode the rewritten history must contain no native Tool messages
/// and no native tool calls, and must carry a
/// `Tool Output (compact_history)` user message.
#[tokio::test]
async fn compact_history_text_mode_emits_text_only_synthetic_pair() {
    let model = CannedSummaryModel {
        text: "summary".into(),
    };
    let agent = AgentConfig {
        disable_native_tools: true,
        ..agent_for_compaction()
    };
    let system: ChatCompletionRequestMessage = ChatCompletionRequestSystemMessage {
        content: "system".into(),
        ..Default::default()
    }
    .into();
    let user: ChatCompletionRequestMessage = ChatCompletionRequestUserMessage {
        content: "go".into(),
        ..Default::default()
    }
    .into();
    let mut messages = vec![system, user];
    for (call_id, body) in [("c1", "A"), ("c2", "B"), ("c3", "C")] {
        messages.extend(synth_user_tool_output(call_id, "read_file", body));
    }
    let outcome = compact_message_history(&model, &agent, &messages, 1)
        .await
        .unwrap();
    // Native tool_calls on an Assistant message, or a Tool-role message,
    // would break the next provider call when `tools: None` is sent.
    for msg in &outcome.new_messages {
        let is_native_tool_msg = matches!(msg, ChatCompletionRequestMessage::Tool(_));
        assert!(
            !is_native_tool_msg,
            "text-mode compact must not emit native Tool role"
        );
        if let ChatCompletionRequestMessage::Assistant(assistant) = msg {
            let no_native_calls = assistant
                .tool_calls
                .as_ref()
                .is_none_or(|calls| calls.is_empty());
            assert!(
                no_native_calls,
                "text-mode compact must not emit native tool_calls"
            );
        }
    }
    // The synthetic compaction must surface as a `Tool Output (compact_history)`
    // user message so subsequent is_tool_boundary scans see it.
    let has_marker = outcome.new_messages.iter().any(|msg| match msg {
        ChatCompletionRequestMessage::User(user) => match &user.content {
            async_openai::types::ChatCompletionRequestUserMessageContent::Text(body) => {
                body.contains("Tool Output (compact_history)")
            }
            _ => false,
        },
        _ => false,
    });
    assert!(has_marker, "missing compact_history user-message marker");
}
/// Passing `keep = 0` must behave like `keep = 1` instead of panicking.
#[tokio::test]
async fn compact_history_clamps_keep_zero_to_one() {
    // Keep=0 would index `tool_msg_indices[len()]` and panic. The
    // internal `.max(1)` clamp must absorb the bad input.
    let model = CannedSummaryModel {
        text: "summary".into(),
    };
    let agent = agent_for_compaction();
    let system: ChatCompletionRequestMessage = ChatCompletionRequestSystemMessage {
        content: "system".into(),
        ..Default::default()
    }
    .into();
    let user: ChatCompletionRequestMessage = ChatCompletionRequestUserMessage {
        content: "go".into(),
        ..Default::default()
    }
    .into();
    let mut messages = vec![system, user];
    for (call_id, body) in [("c1", "A"), ("c2", "B")] {
        messages.extend(synth_tool_pair(call_id, "read_file", body));
    }
    let outcome = compact_message_history(&model, &agent, &messages, 0)
        .await
        .unwrap();
    assert_eq!(outcome.compacted_count, 1, "keep=0 coerced to keep=1");
}
/// A scratchpad at 500 of 1000 chars must come back as `None` (no squeeze).
#[tokio::test]
async fn scratchpad_squeeze_noop_when_under_threshold() {
    let agent = agent_for_compaction();
    let model = CannedSummaryModel {
        text: "should not run".into(),
    };
    let max = 1000;
    let half_full = "x".repeat(500);
    let outcome = squeeze_scratchpad_if_full(&model, &agent, &half_full, max)
        .await
        .unwrap();
    assert!(
        outcome.is_none(),
        "≤95% scratchpad must not trigger LLM call"
    );
}
/// A scratchpad at 960 of 1000 chars must be squeezed into something
/// shorter that contains the model's canned compression text.
#[tokio::test]
async fn scratchpad_squeeze_compresses_when_over_threshold() {
    let agent = agent_for_compaction();
    let model = CannedSummaryModel {
        text: "[compressed older sections]".into(),
    };
    // 960 / 1000 = 96% full → over the threshold.
    let max = 1000;
    let nearly_full = "x".repeat(960);
    let squeezed = squeeze_scratchpad_if_full(&model, &agent, &nearly_full, max)
        .await
        .unwrap()
        .expect("≥95% scratchpad triggers squeeze");
    assert!(squeezed.contains("[compressed older sections]"));
    assert!(
        squeezed.len() < nearly_full.len(),
        "squeezed must be shorter"
    );
}
/// Pins two `AgentConfig` defaults: the squeeze fraction (0.95) and the
/// number of tool calls `compact_history` keeps by default (2).
#[test]
fn scratchpad_squeeze_fraction_default_pinned() {
    let defaults = AgentConfig::default();
    let drift = (defaults.scratchpad_squeeze_fraction - 0.95).abs();
    assert!(drift < f64::EPSILON);
    assert_eq!(defaults.compact_history_default_keep, 2);
}
/// Auto-compaction fires both well above the threshold and exactly at it
/// (360 chars / 4.0 = 90 tokens against a 100-token window).
#[test]
fn auto_compact_triggers_at_or_above_90_percent() {
    for (chars, context_window) in [(480_000, 131_072), (360, 100)] {
        assert!(should_auto_compact(chars, context_window, 10, 4.0));
    }
}
/// 200_000 chars at 4.0 chars/token against a 131_072-token window stays
/// below the trigger, so no auto-compaction.
#[test]
fn auto_compact_skips_under_90_percent() {
    let should_compact = should_auto_compact(200_000, 131_072, 10, 4.0);
    assert!(!should_compact);
}
/// A zero or negative context window disables auto-compaction.
#[test]
fn auto_compact_disabled_when_context_window_unknown() {
    // ctx <= 0 means provider doesn't expose it; let the SDK
    // shrink-guard alone handle overflow.
    for unknown_window in [0, -1] {
        assert!(!should_auto_compact(1_000_000, unknown_window, 100, 4.0));
    }
}
/// Very short histories are never auto-compacted, even at high utilization.
#[test]
fn auto_compact_skips_when_history_too_short() {
    // High utilization but only a handful of messages: nothing useful
    // to fold, so skip and let the SDK shrink-guard engage.
    for message_count in [4, 1] {
        assert!(!should_auto_compact(480_000, 131_072, message_count, 4.0));
    }
}
/// The chars-per-token ratio passed in changes the outcome for the same
/// character count.
#[test]
fn auto_compact_honours_configured_chars_per_token() {
    let chars = 200_000;
    let context_window = 131_072;
    let message_count = 10;
    // CJK / code: ~1.5 chars per token. 200_000 chars at 1.5 →
    // 133k tokens which crosses 90% of 131k.
    let dense = should_auto_compact(chars, context_window, message_count, 1.5);
    assert!(dense);
    // Same char count at default 4.0 stays under threshold.
    let default_ratio = should_auto_compact(chars, context_window, message_count, 4.0);
    assert!(!default_ratio);
    // Non-positive cpt falls back to the 4.0 default rather than
    // dividing by zero / panicking.
    let zero_ratio = should_auto_compact(chars, context_window, message_count, 0.0);
    assert!(!zero_ratio);
}
/// An oversized tool result is clipped, and the clipped text carries a
/// `[truncated:` marker.
#[test]
fn tool_cap_truncates_when_output_exceeds_fraction_of_remaining_context() {
    // 131k ctx, 10k input → ~121k tokens remaining, 10% in chars
    // ≈ 48400 bytes. A 100KB result must clip.
    let original_len = 100_000;
    let mut output = "X".repeat(original_len);
    let was_clipped = apply_tool_output_cap(&mut output, 131_072, 10_000, "read_file");
    assert!(was_clipped, "100KB result against 131k ctx must clip");
    assert!(
        output.len() < original_len,
        "output must be shorter than original after cap"
    );
    let has_marker = output.contains("[truncated:");
    assert!(
        has_marker,
        "marker must be present so the model sees the clip"
    );
}
/// A small tool result is returned untouched and without a marker.
#[test]
fn tool_cap_passthrough_when_under_budget() {
    // 4KB result against 131k ctx → way under 10% cap → no clip.
    let original_len = 4_096;
    let mut output = "Y".repeat(original_len);
    let was_clipped = apply_tool_output_cap(&mut output, 131_072, 1_000, "read_file");
    assert!(!was_clipped);
    assert_eq!(output.len(), original_len, "small output must pass through");
    assert!(!output.contains("[truncated:"));
}
/// With a context window of 0 the cap leaves the output untouched.
#[test]
fn tool_cap_noop_when_context_window_unknown() {
    let original_len = 10_000;
    let mut output = "Z".repeat(original_len);
    let was_clipped = apply_tool_output_cap(&mut output, 0, 1_000, "read_file");
    assert!(!was_clipped);
    assert_eq!(output.len(), original_len);
}
/// Clipping multi-byte text must cut on a character boundary.
#[test]
fn tool_cap_respects_utf8_char_boundary() {
    // Every char is the 3-byte 中. If `cap_bytes` lands inside a
    // multi-byte char, naive `truncate` panics — `apply_tool_output_cap`
    // walks back to a boundary first.
    let mut output = "中".repeat(2_000);
    let bytes_before = output.len();
    let was_clipped = apply_tool_output_cap(&mut output, 4_096, 0, "read_file");
    assert!(was_clipped);
    assert!(output.len() < bytes_before);
    assert!(output.contains("[truncated:"));
    // Everything before the marker must be whole `中` codepoints — no
    // half-character may be left at the cut.
    let mut pieces = output.split("\n[truncated:");
    let prefix = pieces.next().expect("marker delimiter must exist");
    let only_full_chars = prefix.chars().all(|c| c == '中');
    assert!(
        only_full_chars,
        "truncated prefix must end on a full UTF-8 character; got {prefix:?}"
    );
}
/// When the estimated input already fills the context window, the whole
/// tool output must be replaced by a short "no remaining budget" marker.
#[test]
fn tool_cap_replaces_with_marker_when_remaining_budget_zero() {
    // `estimated_input_tokens >= context_window` means
    // `remaining_tokens == 0` and `cap_bytes == 0`. The cap
    // must replace the entire payload with a marker so the
    // model sees the empty-budget signal instead of inheriting
    // verbatim oversized output.
    let mut output = "X".repeat(50_000);
    let truncated = apply_tool_output_cap(&mut output, 1_000, 1_000, "read_file");
    assert!(truncated);
    assert!(
        output.starts_with("[truncated: no remaining context budget"),
        "marker must replace output when budget is 0; got {:?}",
        // Checked slicing: a raw `&output[..50]` would panic when byte 50
        // is not a char boundary and hide the real failure. `get` returns
        // `None` for that case and for outputs shorter than 50 bytes; both
        // fall back to printing the whole output.
        output.get(..50).unwrap_or(output.as_str())
    );
    assert!(output.len() < 200, "marker must be short");
}
// Prompt-exposure guardrail tests live alongside any concrete
// `OutputLeakDetector` implementations — see the `with_output_guard`
// doc-comment on `ProposerEvaluatorAgent`.
// ─── empty_terminal_tool_content — max-iterations fallback ───────
//
// On exhaustion, `react_loop` emits a schema-valid but semantically
// empty response so the orchestrator can proceed without waiting
// on a stuck agent. The response must deserialize into the expected
// terminal tool's struct without error.
/// The synthetic `submit_proposal` payload deserializes into
/// `StructuredProposalResponse` with an explanatory thought process and an
/// empty solution.
#[test]
fn empty_submit_proposal_content_has_explanatory_thought_and_empty_solution() {
    let payload = empty_terminal_tool_content(Some("submit_proposal"));
    let proposal: StructuredProposalResponse = serde_json::from_str(&payload)
        .expect("must round-trip into StructuredProposalResponse");
    let explains_exhaustion = proposal.thought_process.contains("maximum iterations");
    assert!(
        explains_exhaustion,
        "thought_process should explain why the agent voted empty"
    );
    assert_eq!(
        proposal.solution_content, "",
        "solution_content must be empty"
    );
}
/// The `submit_batch_evaluation` fallback must deserialize into
/// `StructuredBatchEvaluationResponse` with no evaluations.
#[test]
fn empty_submit_batch_evaluation_content_is_empty_array() {
    let fallback = empty_terminal_tool_content(Some("submit_batch_evaluation"));
    let batch = serde_json::from_str::<StructuredBatchEvaluationResponse>(&fallback)
        .expect("must round-trip into StructuredBatchEvaluationResponse");

    assert!(batch.evaluations.is_empty(), "evaluations must be empty");
}
/// A missing or unrecognised terminal tool name yields a bare JSON object.
#[test]
fn empty_terminal_tool_content_unknown_name_returns_empty_object() {
    for tool_name in [None, Some("other_tool")] {
        assert_eq!(empty_terminal_tool_content(tool_name), "{}");
    }
}
/// A leading scratchpad block (and the blank lines around it) is removed,
/// leaving only the original prompt text.
#[test]
fn strip_scratchpad_removes_block() {
    let prompt = "\n\n<scratchpad>\n(This persists across tool calls.)\nsome notes\n</scratchpad>\n\nOriginal prompt text here";
    let stripped = strip_scratchpad(prompt);

    assert!(!stripped.contains("<scratchpad>"));
    assert_eq!(stripped, "Original prompt text here");
}
/// Text that contains no scratchpad block passes through unchanged.
#[test]
fn strip_scratchpad_preserves_text_without_block() {
    let plain = "Just a normal prompt with no scratchpad";
    let stripped = strip_scratchpad(plain);
    assert_eq!(stripped, plain);
}
/// Input consisting solely of a scratchpad block strips down to nothing.
#[test]
fn strip_scratchpad_handles_only_scratchpad() {
    let only_block = "<scratchpad>\nsome notes\n</scratchpad>";
    assert_eq!(strip_scratchpad(only_block), "");
}
/// An opening tag with no matching close is left untouched.
#[test]
fn strip_scratchpad_handles_unclosed_tag() {
    let unclosed = "<scratchpad>\nsome notes without closing tag\nOriginal text";
    // No `</scratchpad>` present, so the text must come back as-is.
    assert_eq!(strip_scratchpad(unclosed), unclosed);
}
/// Text on both sides of the scratchpad block survives, joined by a single
/// blank line.
#[test]
fn strip_scratchpad_preserves_text_before_and_after() {
    let surrounded = "Before text\n\n<scratchpad>\nnotes\n</scratchpad>\n\nAfter text";
    assert_eq!(strip_scratchpad(surrounded), "Before text\n\nAfter text");
}
/// With merge_system_prompt enabled, a retry's input_history already holds
/// "{sys_prompt}\n\n<scratchpad>...</scratchpad>\n\n{user_query}". Recovering
/// original_user_content from it must yield only the user_query that follows
/// </scratchpad>, never the system-prompt prefix.
#[test]
fn strip_scratchpad_merge_system_prompt_extracts_after_closing_tag() {
    let merged = "You are a helpful agent.\n\n<scratchpad>\n(This persists across tool calls.)\n(Empty)\n</scratchpad>\n\nWhat is the meaning of life?";

    // Mirror the merge_system_prompt branch: keep whatever follows the
    // closing tag, with leading whitespace trimmed.
    let (_before, after_close) = merged.split_once("</scratchpad>").unwrap();
    let user_query = after_close.trim_start();

    assert_eq!(user_query, "What is the meaning of life?");
}
/// Truncating a note to a byte budget must back off to a character boundary
/// when the raw cut point falls inside a multi-byte UTF-8 character.
///
/// The inputs are chosen so the cut point (`max_len - suffix.len()`) really
/// does land mid-character; with a budget smaller than the suffix the cut
/// point would saturate to 0 and the boundary walk-back would never run.
#[test]
fn test_utf8_safe_truncation() {
    // 12 chars, 3 bytes each: 36 bytes total.
    let mut note = "日本語テスト".repeat(2);
    let suffix = "...(truncated)"; // 14 ASCII bytes
    let max_len = 30;
    assert!(note.len() > max_len, "fixture must exceed the budget");

    // 30 - 14 = 16, which is inside the sixth character (bytes 15..18).
    let truncate_at = max_len.saturating_sub(suffix.len());
    assert!(
        !note.is_char_boundary(truncate_at),
        "fixture must place the raw cut inside a multi-byte char"
    );

    // Walk back to the last character start at or before the raw cut point.
    let safe_at = note
        .char_indices()
        .map(|(i, _)| i)
        .take_while(|&i| i <= truncate_at)
        .last()
        .unwrap_or(0);
    assert_eq!(safe_at, 15);
    note.truncate(safe_at);
    note.push_str(suffix);

    // Five whole characters survive, followed by the suffix, within budget.
    assert_eq!(note, "日本語テス...(truncated)");
    assert!(note.ends_with(suffix));
    assert!(note.len() <= max_len);
}
// =========================================================================
// strip_working_memory() Tests
// =========================================================================
/// A lone working-memory block strips down to the empty string.
#[test]
fn strip_working_memory_removes_single_block() {
    let only_block = "<working_memory>ephemeral thoughts</working_memory>";
    let stripped = strip_working_memory(only_block);
    assert_eq!(stripped, "");
}
/// Only the working-memory block is dropped; sibling sections stay intact.
#[test]
fn strip_working_memory_preserves_other_sections() {
    let notes = "<key_findings>important stuff</key_findings>\n\n<working_memory>scratch</working_memory>\n\n<strategy>my plan</strategy>";
    let stripped = strip_working_memory(notes);

    // The ephemeral block and its content are gone...
    assert!(!stripped.contains("<working_memory>"));
    assert!(!stripped.contains("scratch"));
    // ...while the durable sections remain.
    assert!(stripped.contains("<key_findings>important stuff</key_findings>"));
    assert!(stripped.contains("<strategy>my plan</strategy>"));
}
/// Every working-memory block is removed, not just the first one.
#[test]
fn strip_working_memory_handles_multiple_blocks() {
    let notes = "<working_memory>block1</working_memory>\n\nsome text\n\n<working_memory>block2</working_memory>";
    let stripped = strip_working_memory(notes);

    assert!(stripped.contains("some text"));
    assert!(!stripped.contains("<working_memory>"));
}
/// A working-memory block that is never closed is kept rather than guessed at.
#[test]
fn strip_working_memory_handles_unclosed_tag() {
    let malformed = "<key_findings>ok</key_findings>\n\n<working_memory>unclosed block";
    // Malformed content is retained, so the opening tag is still present.
    assert!(strip_working_memory(malformed).contains("<working_memory>"));
}
/// Without any working-memory block the input comes back unchanged.
#[test]
fn strip_working_memory_no_op_when_absent() {
    let notes = "<key_findings>data</key_findings>\n<strategy>plan</strategy>";
    let stripped = strip_working_memory(notes);
    assert_eq!(stripped, notes);
}
/// Text on either side of the block survives, separated by one blank line.
#[test]
fn strip_working_memory_preserves_text_before_and_after() {
    let surrounded = "Before\n\n<working_memory>temp</working_memory>\n\nAfter";
    assert_eq!(strip_working_memory(surrounded), "Before\n\nAfter");
}
// =========================================================================
// extract_evaluation_sections() Tests
// =========================================================================
/// Both durable sections are extracted and working memory is left behind.
#[test]
fn extract_evaluation_sections_both_present() {
    let notes = "<working_memory>trash</working_memory>\n<key_findings>findings here</key_findings>\n<strategy>my strategy</strategy>";
    let sections = extract_evaluation_sections(notes);

    // Nothing from the working-memory block may leak through.
    assert!(!sections.contains("working_memory"));
    assert!(!sections.contains("trash"));
    // Both durable sections are present verbatim.
    assert!(sections.contains("<key_findings>findings here</key_findings>"));
    assert!(sections.contains("<strategy>my strategy</strategy>"));
}
/// With only `<key_findings>` present, exactly that section is returned.
#[test]
fn extract_evaluation_sections_only_key_findings() {
    let notes = "some preamble\n<key_findings>important data</key_findings>\nother stuff";
    assert_eq!(
        extract_evaluation_sections(notes),
        "<key_findings>important data</key_findings>"
    );
}
/// With only `<strategy>` present, exactly that section is returned.
#[test]
fn extract_evaluation_sections_only_strategy() {
    let notes = "prefix\n<strategy>my plan</strategy>\nsuffix";
    assert_eq!(
        extract_evaluation_sections(notes),
        "<strategy>my plan</strategy>"
    );
}
/// Unstructured notes are returned whole when no section tags exist.
#[test]
fn extract_evaluation_sections_fallback_when_none() {
    let plain = "No structured sections here, just plain text notes";
    let sections = extract_evaluation_sections(plain);
    assert_eq!(
        sections, plain,
        "Should fall back to full text when no sections found"
    );
}
/// Section bodies that span several lines are captured in full.
#[test]
fn extract_evaluation_sections_handles_multiline_content() {
    let notes = "<key_findings>\n - Finding 1\n - Finding 2\n</key_findings>\n<strategy>\n Step A\n Step B\n</strategy>";
    let sections = extract_evaluation_sections(notes);

    for expected in ["Finding 1", "Finding 2", "Step A"] {
        assert!(sections.contains(expected));
    }
}
/// Sections without a closing tag do not count; the full text is returned.
#[test]
fn extract_evaluation_sections_ignores_unclosed_tags() {
    let malformed = "<key_findings>open section without closing\n<strategy>also unclosed";
    // Neither tag is closed, so extraction falls back to the whole input.
    assert_eq!(extract_evaluation_sections(malformed), malformed);
}
// =========================================================================
// Regression: StructuredProposalResponse parsing
// From failures/quant-ml_MACRO/parse_error_r1.md
// =========================================================================
/// Regression: Mistral sends `thought_process` as an array of strings rather
/// than one string, which used to fail with
/// "invalid type: sequence, expected a string".
#[test]
fn regression_proposal_thought_process_as_array() {
    let payload = serde_json::json!({
        "thought_process": [
            "Step 1: Asset Class Allocation",
            "Current allocation is split as Equities (50%), Fixed Income (30%), Alternatives (20%).",
            "Step 2: Geographic Tilt",
            "The geographic tilt remains balanced."
        ],
        "solution_content": "Proposed allocation: Equities 45%, Fixed Income 35%, Alternatives 20%"
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();
    let thoughts = &proposal.thought_process;

    assert!(thoughts.contains("Step 1: Asset Class Allocation"));
    assert!(thoughts.contains("Step 2: Geographic Tilt"));
    // The array elements are joined with newlines.
    assert!(thoughts.contains('\n'));
    assert_eq!(
        proposal.solution_content,
        "Proposed allocation: Equities 45%, Fixed Income 35%, Alternatives 20%"
    );
}
/// Regression: Mistral nests an object under `solution_content` instead of
/// sending a string, which used to fail with
/// "invalid type: map, expected a string".
#[test]
fn regression_proposal_solution_content_as_nested_object() {
    let payload = serde_json::json!({
        "solution_content": {
            "thought_process": ["Step 1", "Step 2"],
            "allocations": {"Equities": 45, "Fixed Income": 35},
            "hedge_portfolio": {"Strategy": "Protective Puts"}
        },
        "thought_process": "My reasoning steps."
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();

    assert_eq!(proposal.thought_process, "My reasoning steps.");
    // The nested object is flattened into a JSON string, so its keys and
    // values must be findable as text.
    let solution = &proposal.solution_content;
    assert!(solution.contains("Equities"));
    assert!(solution.contains("Protective Puts"));
}
/// Plain string fields deserialize verbatim.
#[test]
fn proposal_response_with_normal_strings() {
    let payload = serde_json::json!({
        "thought_process": "I analyzed the problem carefully.",
        "solution_content": "The answer is 42."
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();

    assert_eq!(proposal.solution_content, "The answer is 42.");
    assert_eq!(proposal.thought_process, "I analyzed the problem carefully.");
}
// =========================================================================
// strip_thinking_prefix tests
// =========================================================================
/// A leaked leading "final" token is removed.
#[test]
fn strip_thinking_prefix_final() {
    let leaked = "final**Critique Integration:** The peer reviews...";
    assert_eq!(
        strip_thinking_prefix(leaked),
        "**Critique Integration:** The peer reviews..."
    );
}
/// Everything up to and including "assistantfinal" is removed.
#[test]
fn strip_thinking_prefix_analysis_assistant_final() {
    let leaked = "analysisWe need to submit full proposal.assistantfinal**Critique Integration:** ...rest";
    assert_eq!(
        strip_thinking_prefix(leaked),
        "**Critique Integration:** ...rest"
    );
}
/// A "commentary…json" routing prefix is stripped down to the JSON body.
#[test]
fn strip_thinking_prefix_commentary() {
    let leaked = "commentaryto=functions.submit_proposal json{\"thought_process\": \"test\"}";
    assert_eq!(
        strip_thinking_prefix(leaked),
        "{\"thought_process\": \"test\"}"
    );
}
/// Content without any leaked token is returned as-is.
#[test]
fn strip_thinking_prefix_clean_content_unchanged() {
    let clean = "**Critique Integration:** Normal content.";
    let stripped = strip_thinking_prefix(clean);
    assert_eq!(stripped, "**Critique Integration:** Normal content.");
}
/// A bare JSON payload is returned untouched.
#[test]
fn strip_thinking_prefix_json_unchanged() {
    let json_body = "{\"thought_process\": \"test\", \"solution_content\": \"x\"}";
    assert_eq!(strip_thinking_prefix(json_body), json_body);
}
// =========================================================================
// StructuredBatchEvaluationResponse alias tests
// =========================================================================
/// Some models emit "candidate_evaluations" (and "candidate_id") where the
/// schema expects "evaluations" (and "agent_id"); both aliases must parse.
#[test]
fn batch_eval_alias_candidate_evaluations() {
    let payload = r#"{"candidate_evaluations": [{"candidate_id": "A", "endorsement_weight": 75, "stance": "agree", "claim_assessments": [], "disagreements": [], "category_scores": {"correctness": 80, "completeness": 75, "novelty": 70, "feasibility": 85, "evidence_quality": 78}}]}"#;
    let batch = serde_json::from_str::<StructuredBatchEvaluationResponse>(payload).unwrap();

    assert_eq!(batch.evaluations.len(), 1);
    assert_eq!(batch.evaluations[0].agent_id, "A");
}
/// Some models emit "candidates" where the schema expects "evaluations".
#[test]
fn batch_eval_alias_candidates() {
    let payload = r#"{"candidates": [{"agent_id": "B", "endorsement_weight": 60, "stance": "disagree"}]}"#;
    let batch = serde_json::from_str::<StructuredBatchEvaluationResponse>(payload).unwrap();

    assert_eq!(batch.evaluations.len(), 1);
    assert_eq!(batch.evaluations[0].agent_id, "B");
}
/// Full gpt-oss MOMENTUM r2 evaluation payload whose claim_assessments carry
/// only "claim_id" + "verdict" (no "claim" text), and whose disagreements use
/// "proposal" / "counter" as field names.
#[test]
fn batch_eval_gpt_oss_claim_id_only_assessments() {
    let payload = r#"{"evaluations": [{"agent_id": "Candidate_C", "stance": "disagree", "claim_assessments": [{"claim_id": "C1", "verdict": "unverified"}, {"claim_id": "C2", "verdict": "contested"}, {"claim_id": "C3", "verdict": "verified"}], "disagreements": [{"claim_id": "C1", "proposal": "Equity allocation of 40%", "counter": "40% equity exceeds drawdown limit.", "confidence": "high"}], "category_scores": {"correctness": 40, "completeness": 30, "novelty": 20, "feasibility": 45, "evidence_quality": 30}, "endorsement_weight": 25}]}"#;
    let batch = serde_json::from_str::<StructuredBatchEvaluationResponse>(payload).unwrap();
    let disagreements = &batch.evaluations[0].disagreements;

    assert_eq!(disagreements.len(), 1);
    assert_eq!(
        disagreements[0].evaluator_position,
        "40% equity exceeds drawdown limit."
    );
    assert_eq!(
        disagreements[0].proposal_claims,
        "Equity allocation of 40%"
    );
}
// =========================================================================
// write_failure_dump config precedence tests
// =========================================================================
/// An explicit "off" in the config wins: no dump is written and the
/// NSED_FAILURE_DUMPS env var is not consulted.
#[test]
fn dump_mode_config_off_suppresses_dumps() {
    let dump = write_failure_dump(FailureDumpParams {
        // The switch under test.
        failure_dumps_config: Some("off"),
        // Minimal identity and payload for the dump request.
        kind: "test",
        agent_name: "test-agent",
        model_name: "model",
        provider_id: "provider",
        error: "some error",
        response_content: Some("content"),
        // Everything else is left unset.
        session_id: None,
        phase: None,
        round: None,
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        system_prompt: None,
        request_body: None,
        messages: None,
    });

    // "off" short-circuits to None before any env-var lookup.
    assert!(dump.is_none(), "config 'off' should suppress dumps");
}
/// With no config value the decision falls to the env var; with that unset
/// as well, no dump is written.
#[test]
fn dump_mode_config_none_no_env_var() {
    let dump = write_failure_dump(FailureDumpParams {
        // No config override: defer to the environment.
        failure_dumps_config: None,
        // Minimal identity and payload for the dump request.
        kind: "test",
        agent_name: "test-agent",
        model_name: "model",
        provider_id: "provider",
        error: "some error",
        response_content: Some("content"),
        // Everything else is left unset.
        session_id: None,
        phase: None,
        round: None,
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        system_prompt: None,
        request_body: None,
        messages: None,
    });

    // Neither config nor env var enables dumps, so nothing is produced.
    assert!(dump.is_none(), "no config + no env var = no dumps");
}
// =========================================================================
// Additional strip_scratchpad tests
// =========================================================================
/// Nested scratchpad tags: the span from the first opening tag to the first
/// closing tag is removed, so the outer closing tag is left as stray text.
#[test]
fn test_strip_scratchpad_nested_tags() {
    let nested = "<scratchpad><scratchpad>inner</scratchpad>outer</scratchpad>after content";
    let stripped = strip_scratchpad(nested);

    // What follows the removed span ("outer</scratchpad>after content")
    // must still include the trailing content.
    assert!(stripped.contains("after content"));
    // The inner block must be gone.
    assert!(!stripped.contains("<scratchpad>inner</scratchpad>"));
}
/// Empty input yields empty output.
#[test]
fn test_strip_scratchpad_empty() {
    assert_eq!(strip_scratchpad(""), "");
}
/// Plain text without scratchpad tags is returned unchanged.
#[test]
fn test_strip_scratchpad_no_tags() {
    let plain = "This is plain text with no scratchpad tags at all.";
    assert_eq!(strip_scratchpad(plain), plain);
}
/// An opening tag with no closing tag leaves the input as-is.
#[test]
fn test_strip_scratchpad_missing_close() {
    let unclosed = "<scratchpad>content without closing tag";
    let stripped = strip_scratchpad(unclosed);
    assert_eq!(stripped, unclosed);
}
// =========================================================================
// truncate_for_dump tests
// =========================================================================
/// `truncate_for_dump` leaves short and exactly-at-limit strings alone and
/// cuts longer ones to the limit, appending a truncation indicator.
#[test]
fn test_truncate_for_dump() {
    use super::truncate_for_dump;

    // Below the limit: returned unchanged.
    assert_eq!(truncate_for_dump("hello", 100), "hello");

    // Above the limit: shortened and tagged with the indicator.
    let oversized = "a".repeat(200);
    let dumped = truncate_for_dump(&oversized, 50);
    assert!(dumped.len() < oversized.len());
    assert!(dumped.contains("... (truncated at 50 chars)"));

    // Everything before the indicator is exactly the first 50 chars.
    let kept = dumped
        .split("\n... (truncated at 50 chars)")
        .next()
        .unwrap();
    assert_eq!(kept.chars().count(), 50);

    // Exactly at the limit: returned unchanged.
    let at_limit = "b".repeat(50);
    assert_eq!(truncate_for_dump(&at_limit, 50), at_limit);
}
// =========================================================================
// Additional strip_working_memory tests
// =========================================================================
/// Nested working-memory tags: stripping repeats from the first opening tag
/// to the first closing tag until no opening tag is left.
#[test]
fn test_strip_working_memory_nested() {
    let nested =
        "<working_memory>outer<working_memory>inner</working_memory>rest</working_memory>final";
    let stripped = strip_working_memory(nested);

    // The first pass removes through the inner closing tag, leaving
    // "rest</working_memory>final"; with no opening tag left, stripping
    // stops there. The trailing text must survive and no opening tag may.
    assert!(stripped.contains("final"));
    assert!(!stripped.contains("<working_memory>"));
}
// =========================================================================
// strip_thinking_prefix with JSON tests
// =========================================================================
/// JSON that follows a leaked "final" token survives stripping intact.
///
/// `strip_thinking_prefix` only deals with leaked model tokens such as
/// "final", "analysis" and "commentary"; `<think>` tags are removed
/// separately by string replacement in the react_loop.
#[test]
fn test_strip_thinking_prefix_with_json() {
    let leaked = r#"final{"score": 0.5}"#;
    let stripped = strip_thinking_prefix(leaked);
    assert_eq!(stripped, r#"{"score": 0.5}"#);

    // What is left must still parse as JSON with the original value.
    let parsed = serde_json::from_str::<serde_json::Value>(stripped).unwrap();
    assert_eq!(parsed["score"], 0.5);
}
// =========================================================================
// deserialize_string_or_array_or_object — additional coverage
// =========================================================================
/// An array of strings is joined with newlines.
#[test]
fn deser_string_or_array_array_of_strings() {
    let payload = serde_json::json!({
        "thought_process": ["Alpha", "Beta", "Gamma"],
        "solution_content": "plain"
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();

    assert_eq!(proposal.solution_content, "plain");
    assert_eq!(proposal.thought_process, "Alpha\nBeta\nGamma");
}
/// An empty array deserializes to the empty string.
#[test]
fn deser_string_or_array_empty_array() {
    let payload = serde_json::json!({
        "thought_process": [],
        "solution_content": "content"
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();

    assert_eq!(
        proposal.thought_process, "",
        "Empty array should produce empty string"
    );
}
/// An object value is serialized into a JSON string that still parses.
#[test]
fn deser_string_or_array_nested_object() {
    let payload = serde_json::json!({
        "thought_process": {
            "step1": "Analyze requirements",
            "step2": "Propose solution"
        },
        "solution_content": "answer"
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();
    let thoughts = &proposal.thought_process;

    // Keys and values of the object appear in the serialized text...
    assert!(thoughts.contains("step1"));
    assert!(thoughts.contains("Analyze requirements"));
    // ...and that text is itself valid JSON.
    let _reparsed: serde_json::Value = serde_json::from_str(thoughts).unwrap();
}
/// An array mixing strings with numbers, objects and booleans produces one
/// newline-separated line per element.
#[test]
fn deser_string_or_array_mixed_array() {
    let payload = serde_json::json!({
        "thought_process": ["Step 1", 42, {"detail": "nested"}, true],
        "solution_content": "done"
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();
    let segments = proposal.thought_process.split('\n').collect::<Vec<&str>>();

    assert_eq!(segments.len(), 4);
    assert_eq!(segments[0], "Step 1");
    assert_eq!(segments[1], "42");
    assert!(segments[2].contains("nested"));
    assert_eq!(segments[3], "true");
}
/// A one-element array yields that element with no separator.
#[test]
fn deser_string_or_array_single_element_array() {
    let payload = serde_json::json!({
        "thought_process": ["Only one element"],
        "solution_content": "x"
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();

    assert_eq!(proposal.thought_process, "Only one element");
}
/// A bare number is rendered as its decimal text.
#[test]
fn deser_string_or_array_number_value() {
    let payload = serde_json::json!({
        "thought_process": 12345,
        "solution_content": "x"
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();

    assert_eq!(proposal.thought_process, "12345");
}
/// Bare booleans are rendered as "true" / "false".
#[test]
fn deser_string_or_array_boolean_value() {
    let payload = serde_json::json!({
        "thought_process": true,
        "solution_content": false
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();

    assert_eq!(proposal.solution_content, "false");
    assert_eq!(proposal.thought_process, "true");
}
/// A JSON null is rendered as the text "null".
#[test]
fn deser_string_or_array_null_value() {
    let payload = serde_json::json!({
        "thought_process": null,
        "solution_content": "x"
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();

    assert_eq!(proposal.thought_process, "null");
}
/// A deeply nested object is serialized compactly, inner array included.
#[test]
fn deser_string_or_array_deeply_nested_object() {
    let payload = serde_json::json!({
        "thought_process": "simple",
        "solution_content": {
            "level1": {
                "level2": {
                    "data": [1, 2, 3]
                }
            }
        }
    });
    let proposal: StructuredProposalResponse = serde_json::from_value(payload).unwrap();
    let solution = &proposal.solution_content;

    assert!(solution.contains("level2"));
    assert!(solution.contains("[1,2,3]"));
}
// =========================================================================
// strip_thinking_prefix — comprehensive pattern coverage
// =========================================================================
/// "assistant" immediately followed by `{` marks the end of the prefix.
#[test]
fn strip_thinking_prefix_assistant_then_json() {
    let leaked = "analysisblahassistant{\"key\": \"value\"}";
    assert_eq!(strip_thinking_prefix(leaked), "{\"key\": \"value\"}");
}
/// "assistant" immediately followed by `#` marks the end of the prefix.
#[test]
fn strip_thinking_prefix_assistant_then_heading() {
    let leaked = "some thinking tokensassistant# Heading";
    assert_eq!(strip_thinking_prefix(leaked), "# Heading");
}
/// "assistant" immediately followed by `**` marks the end of the prefix.
#[test]
fn strip_thinking_prefix_assistant_then_bold() {
    let leaked = "reflectionassistant**Bold content**";
    assert_eq!(strip_thinking_prefix(leaked), "**Bold content**");
}
/// "assistant" followed by "final" is stripped together with both tokens.
#[test]
fn strip_thinking_prefix_assistant_then_final_then_bold() {
    let leaked = "thinkingassistantfinal**Real content here**";
    assert_eq!(strip_thinking_prefix(leaked), "**Real content here**");
}
/// A bare leading "final" before a markdown heading is removed.
#[test]
fn strip_thinking_prefix_bare_final_with_heading() {
    let leaked = "final# Asset Allocation";
    assert_eq!(strip_thinking_prefix(leaked), "# Asset Allocation");
}
/// A bare leading "final" before a JSON object is removed.
#[test]
fn strip_thinking_prefix_bare_final_with_json() {
    let leaked = "final{\"solution\": \"test\"}";
    assert_eq!(strip_thinking_prefix(leaked), "{\"solution\": \"test\"}");
}
/// The commentary prefix is also stripped when "json" and the body are
/// separated by a space.
#[test]
fn strip_thinking_prefix_commentary_json_with_space() {
    let leaked = "commentaryto=functions.submit json {\"thought_process\": \"x\"}";
    assert_eq!(
        strip_thinking_prefix(leaked),
        "{\"thought_process\": \"x\"}"
    );
}
/// Ordinary prose with no leaked tokens is returned unchanged.
#[test]
fn strip_thinking_prefix_no_prefix_plain_text() {
    let plain = "This is normal content with no thinking tokens.";
    let stripped = strip_thinking_prefix(plain);
    assert_eq!(stripped, "This is normal content with no thinking tokens.");
}
/// The word "assistant" inside normal prose must not trigger stripping.
#[test]
fn strip_thinking_prefix_assistant_mid_word_no_strip() {
    let prose = "The assistant helped me write code";
    // What follows "assistant" here is " helped", which is none of
    // final / { / ** / #, and the text does not start with "final" either,
    // so nothing is removed.
    assert_eq!(
        strip_thinking_prefix(prose),
        "The assistant helped me write code"
    );
}
/// Empty input yields empty output.
#[test]
fn strip_thinking_prefix_empty_string() {
    assert_eq!(strip_thinking_prefix(""), "");
}
/// Input that is just the token "final" strips down to nothing.
#[test]
fn strip_thinking_prefix_only_final() {
    let stripped = strip_thinking_prefix("final");
    assert_eq!(stripped, "");
}
/// Whitespace around the content left after stripping is trimmed.
#[test]
fn strip_thinking_prefix_whitespace_trimming() {
    let leaked = "final **Content with leading spaces** ";
    assert_eq!(
        strip_thinking_prefix(leaked),
        "**Content with leading spaces**"
    );
}
/// A "commentary" prefix is only stripped when a "json{" or "json " marker
/// follows; without one the text is kept.
#[test]
fn strip_thinking_prefix_commentary_without_json_keyword() {
    let no_marker = "commentarysome other text here";
    // Neither "json{" nor "json " occurs, so only trimming applies.
    assert_eq!(
        strip_thinking_prefix(no_marker),
        "commentarysome other text here"
    );
}
// =========================================================================
// extract_evaluation_sections — additional edge cases
// =========================================================================
/// Empty input yields empty output.
#[test]
fn extract_evaluation_sections_empty_input() {
    let sections = extract_evaluation_sections("");
    assert_eq!(sections, "", "Empty input should return empty string");
}
/// Sections whose bodies are empty are still extracted.
#[test]
fn extract_evaluation_sections_empty_tags() {
    let notes = "<key_findings></key_findings>\n<strategy></strategy>";
    let sections = extract_evaluation_sections(notes);

    for empty_section in ["<key_findings></key_findings>", "<strategy></strategy>"] {
        assert!(sections.contains(empty_section));
    }
}
/// XML-like markup nested inside a section is carried through verbatim.
#[test]
fn extract_evaluation_sections_nested_xml_inside_tags() {
    let notes = "<key_findings><item>A</item><item>B</item></key_findings>";
    let sections = extract_evaluation_sections(notes);

    assert_eq!(
        sections,
        "<key_findings><item>A</item><item>B</item></key_findings>"
    );
}
/// Whitespace-only input has no sections and is returned unchanged.
#[test]
fn extract_evaluation_sections_whitespace_only_input() {
    let blank = " \n\n \t ";
    // Nothing to extract, so the fallback returns the full text.
    assert_eq!(extract_evaluation_sections(blank), blank);
}
/// When a section tag appears twice, the first occurrence is extracted.
#[test]
fn extract_evaluation_sections_duplicate_key_findings() {
    let notes =
        "<key_findings>first</key_findings>\nsome gap\n<key_findings>second</key_findings>";
    let sections = extract_evaluation_sections(notes);

    // Only the first block's presence is pinned here.
    assert!(sections.contains("first"));
}
/// Text outside the recognised sections is discarded.
#[test]
fn extract_evaluation_sections_sections_with_surrounding_junk() {
    let notes = "IGNORE THIS PREAMBLE\n\n<key_findings>Core insight: volatility is rising</key_findings>\n\nMORE JUNK\n\n<strategy>Hedge with puts</strategy>\n\nTRAILING GARBAGE";
    let sections = extract_evaluation_sections(notes);

    // Both sections are kept verbatim.
    assert!(sections.contains("<key_findings>Core insight: volatility is rising</key_findings>"));
    assert!(sections.contains("<strategy>Hedge with puts</strategy>"));
    // Nothing from before, between or after them survives.
    for junk in ["IGNORE THIS PREAMBLE", "MORE JUNK", "TRAILING GARBAGE"] {
        assert!(!sections.contains(junk));
    }
}
/// Output order is fixed — key_findings, then strategy — whatever order the
/// sections had in the input.
#[test]
fn extract_evaluation_sections_strategy_before_key_findings() {
    let notes = "<strategy>plan first</strategy>\n<key_findings>findings second</key_findings>";
    let sections = extract_evaluation_sections(notes);

    assert!(sections.contains("<key_findings>findings second</key_findings>"));
    assert!(sections.contains("<strategy>plan first</strategy>"));

    // Compare where each section starts in the joined output.
    let findings_at = sections.find("<key_findings>").unwrap();
    let strategy_at = sections.find("<strategy>").unwrap();
    assert!(
        findings_at < strategy_at,
        "key_findings should appear before strategy in output"
    );
}
// =========================================================================
// prune_failure_dirs — filesystem tests with temp directories
// =========================================================================
/// Pruning with a limit above the current directory count removes nothing.
#[test]
fn prune_failure_dirs_no_op_under_limit() {
    use super::prune_failure_dirs;

    // Fresh, per-process scratch directory.
    let root = std::env::temp_dir().join(format!("nsed_prune_test_noop_{}", std::process::id()));
    let _ = std::fs::remove_dir_all(&root);
    std::fs::create_dir_all(&root).unwrap();

    // Three subdirectories against a limit of five.
    for idx in 0..3 {
        std::fs::create_dir_all(root.join(format!("dir_{idx}"))).unwrap();
    }

    prune_failure_dirs(root.to_str().unwrap(), 5);

    let dir_count = std::fs::read_dir(&root)
        .unwrap()
        .filter_map(Result::ok)
        .filter(|entry| matches!(entry.file_type(), Ok(kind) if kind.is_dir()))
        .count();
    assert_eq!(
        dir_count,
        3,
        "All 3 dirs should remain when under limit"
    );

    let _ = std::fs::remove_dir_all(&root);
}
/// Pruning five directories down to a limit of two leaves exactly two.
#[test]
fn prune_failure_dirs_removes_oldest_when_over_limit() {
    use super::prune_failure_dirs;

    // Fresh, per-process scratch directory.
    let root = std::env::temp_dir().join(format!("nsed_prune_test_over_{}", std::process::id()));
    let _ = std::fs::remove_dir_all(&root);
    std::fs::create_dir_all(&root).unwrap();

    // Five non-empty subdirectories, created 50 ms apart so their mtimes
    // differ even on filesystems with coarse timestamps.
    let pause = std::time::Duration::from_millis(50);
    for idx in 0..5 {
        let subdir = root.join(format!("dir_{idx}"));
        std::fs::create_dir_all(&subdir).unwrap();
        std::fs::write(subdir.join("marker.txt"), format!("dir {idx}")).unwrap();
        std::thread::sleep(pause);
    }

    // Limit of two: the three oldest are expected to go.
    prune_failure_dirs(root.to_str().unwrap(), 2);

    let dir_count = std::fs::read_dir(&root)
        .unwrap()
        .filter_map(Result::ok)
        .filter(|entry| matches!(entry.file_type(), Ok(kind) if kind.is_dir()))
        .count();
    assert_eq!(
        dir_count,
        2,
        "Should have 2 dirs remaining after pruning from 5"
    );

    let _ = std::fs::remove_dir_all(&root);
}
/// Pruning a parent directory that does not exist must not panic.
#[test]
fn prune_failure_dirs_nonexistent_parent() {
    use super::prune_failure_dirs;

    let missing_parent = "/tmp/nsed_nonexistent_prune_dir_12345";
    // Completing without a panic is the whole assertion.
    prune_failure_dirs(missing_parent, 5);
}
/// A directory count exactly equal to the limit triggers no pruning.
#[test]
fn prune_failure_dirs_exact_limit() {
    use super::prune_failure_dirs;

    // Fresh, per-process scratch directory.
    let root =
        std::env::temp_dir().join(format!("nsed_prune_test_exact_{}", std::process::id()));
    let _ = std::fs::remove_dir_all(&root);
    std::fs::create_dir_all(&root).unwrap();

    // Three subdirectories against a limit of three.
    for idx in 0..3 {
        std::fs::create_dir_all(root.join(format!("dir_{idx}"))).unwrap();
    }

    prune_failure_dirs(root.to_str().unwrap(), 3);

    let dir_count = std::fs::read_dir(&root)
        .unwrap()
        .filter_map(Result::ok)
        .filter(|entry| matches!(entry.file_type(), Ok(kind) if kind.is_dir()))
        .count();
    assert_eq!(dir_count, 3, "Exactly at limit — no pruning");

    let _ = std::fs::remove_dir_all(&root);
}
/// A limit of zero removes every subdirectory.
#[test]
fn prune_failure_dirs_max_zero_removes_all() {
    use super::prune_failure_dirs;

    // Fresh, per-process scratch directory.
    let root = std::env::temp_dir().join(format!("nsed_prune_test_zero_{}", std::process::id()));
    let _ = std::fs::remove_dir_all(&root);
    std::fs::create_dir_all(&root).unwrap();

    for idx in 0..3 {
        std::fs::create_dir_all(root.join(format!("dir_{idx}"))).unwrap();
    }

    prune_failure_dirs(root.to_str().unwrap(), 0);

    let dir_count = std::fs::read_dir(&root)
        .unwrap()
        .filter_map(Result::ok)
        .filter(|entry| matches!(entry.file_type(), Ok(kind) if kind.is_dir()))
        .count();
    assert_eq!(dir_count, 0, "Max 0 should remove all directories");

    let _ = std::fs::remove_dir_all(&root);
}
/// Regular files sitting next to the directories are neither counted toward
/// the limit nor deleted.
#[test]
fn prune_failure_dirs_ignores_files() {
    use super::prune_failure_dirs;
    let root = std::env::temp_dir().join(format!("nsed_prune_test_files_{}", std::process::id()));
    let _ = std::fs::remove_dir_all(&root);
    std::fs::create_dir_all(&root).unwrap();
    // Three directories plus two regular files.
    for idx in 0..3 {
        std::fs::create_dir_all(root.join(format!("dir_{idx}"))).unwrap();
    }
    let file_a = root.join("file_a.txt");
    let file_b = root.join("file_b.txt");
    std::fs::write(&file_a, "a").unwrap();
    std::fs::write(&file_b, "b").unwrap();
    prune_failure_dirs(root.to_str().unwrap(), 1);
    // Only directories take part in pruning.
    let dir_count = std::fs::read_dir(&root)
        .unwrap()
        .filter_map(Result::ok)
        .filter(|entry| matches!(entry.file_type(), Ok(ft) if ft.is_dir()))
        .count();
    assert_eq!(dir_count, 1, "Should prune down to 1 directory");
    // The files must survive.
    assert!(file_a.exists());
    assert!(file_b.exists());
    let _ = std::fs::remove_dir_all(&root);
}
// =========================================================================
// write_failure_dump — full integration test with temp directory
// =========================================================================
/// End-to-end check of `write_failure_dump` with config `"1"`.
///
/// Verifies that a dump is produced, that the returned path carries the
/// session id plus the `<kind>_r<round>.md` suffix, and that the written
/// markdown contains the header, metadata rows, error text and raw response.
/// In this non-"full" mode it also checks that the hint about
/// `NSED_FAILURE_DUMPS=full` is present and that no system prompt section
/// was written.
#[test]
#[serial]
fn write_failure_dump_creates_file_with_correct_format() {
    // write_failure_dump creates "failures/<job_dir>" relative to CWD.
    // We test with config "1" and session_id to verify the format.
    let result = write_failure_dump(FailureDumpParams {
        kind: "parse_error",
        agent_name: "test-agent",
        model_name: "gpt-4",
        provider_id: "openai",
        error: "JSON parse failed: unexpected token",
        session_id: Some("abc12345"),
        phase: Some("propose"),
        round: Some(2),
        attempt: Some(3),
        finish_reason: Some("stop"),
        input_tokens: Some(500),
        output_tokens: Some(200),
        response_content: Some("This is the raw LLM response"),
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some("1"),
    });
    // Should have created a file
    assert!(result.is_some(), "Dump should be created with config '1'");
    let filename = result.unwrap();
    assert!(
        filename.contains("abc12345"),
        "Filename should contain session ID prefix"
    );
    assert!(
        filename.contains("parse_error_r2.md"),
        "Filename should contain kind and round"
    );
    // Read and verify content
    let content = std::fs::read_to_string(&filename).unwrap();
    let expected_fragments = [
        ("# parse_error (attempt 3)", "Should contain header"),
        ("| agent | test-agent |", "Should contain agent name"),
        ("| model | gpt-4 |", "Should contain model name"),
        ("| phase | propose |", "Should contain phase"),
        ("| round | 2 |", "Should contain round"),
        ("## Error", "Should contain error section"),
        ("JSON parse failed", "Should contain error text"),
        ("## LLM Response", "Should contain response section"),
        (
            "This is the raw LLM response",
            "Should contain response content",
        ),
        // Without "full" mode the dump points the reader at the full-mode switch.
        (
            "NSED_FAILURE_DUMPS=full",
            "Should contain hint about full mode",
        ),
    ];
    for (fragment, why) in expected_fragments {
        assert!(content.contains(fragment), "{why}");
    }
    // Without "full" mode, should NOT contain system prompt section
    assert!(
        !content.contains("## System Prompt"),
        "Basic mode should not include a system prompt section"
    );
    // Cleanup
    let _ = std::fs::remove_dir_all("failures");
}
/// With config `"full"`, the dump includes the system prompt and request
/// body sections.
#[test]
#[serial]
fn write_failure_dump_full_mode_includes_system_prompt() {
    let params = FailureDumpParams {
        kind: "api_error",
        agent_name: "full-test-agent",
        model_name: "model",
        provider_id: "provider",
        error: "timeout",
        session_id: Some("fulltest1"),
        phase: None,
        round: Some(1),
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: Some("You are a helpful assistant."),
        request_body: Some("{\"model\": \"test\"}"),
        messages: None,
        failure_dumps_config: Some("full"),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_some(), "Full mode dump should be created");
    let dump_path = outcome.unwrap();
    let body = std::fs::read_to_string(&dump_path).unwrap();
    assert!(
        body.contains("## System Prompt"),
        "Full mode should include system prompt"
    );
    assert!(body.contains("You are a helpful assistant."));
    assert!(
        body.contains("## Request Body"),
        "Full mode should include request body"
    );
    // Remove the dump directory created relative to the CWD.
    let _ = std::fs::remove_dir_all("failures");
}
/// An unrecognised config value yields no dump.
#[test]
fn write_failure_dump_invalid_mode_returns_none() {
    let params = FailureDumpParams {
        kind: "test",
        agent_name: "agent",
        model_name: "model",
        provider_id: "prov",
        error: "err",
        session_id: None,
        phase: None,
        round: None,
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: None,
        request_body: None,
        messages: None,
        // Deliberately none of "1", "on", or "full".
        failure_dumps_config: Some("maybe"),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_none(), "Invalid mode 'maybe' should return None");
}
// =========================================================================
// truncate_for_dump — additional edge cases
// =========================================================================
/// An empty input comes back empty, with no truncation note.
#[test]
fn test_truncate_for_dump_empty() {
    use super::truncate_for_dump;
    let truncated = truncate_for_dump("", 100);
    assert_eq!(truncated, "");
}
/// Truncation counts characters, not bytes, so multi-byte text is cut on a
/// character boundary.
#[test]
fn test_truncate_for_dump_unicode() {
    use super::truncate_for_dump;
    // Nine characters, each several bytes long in UTF-8.
    let text = "日本語テストデータ";
    let truncated = truncate_for_dump(text, 5);
    assert!(truncated.contains("... (truncated at 5 chars)"));
    // Everything before the truncation note is the retained prefix.
    let (kept, _) = truncated
        .split_once("\n... (truncated at 5 chars)")
        .unwrap();
    assert_eq!(kept.chars().count(), 5);
    assert_eq!(kept, "日本語テス");
}
/// An input one character over the limit is truncated: the note is appended
/// and exactly the first `max_chars` characters are kept in front of it.
#[test]
fn test_truncate_for_dump_one_char_over() {
    use super::truncate_for_dump;
    let input = "abcdef"; // 6 chars
    let result = truncate_for_dump(input, 5);
    assert!(result.contains("... (truncated at 5 chars)"));
    // Checking only the marker would also pass if the wrong prefix were kept,
    // so pin the retained text as well.
    let prefix = result.split("\n... (truncated at 5 chars)").next().unwrap();
    assert_eq!(prefix, "abcde");
}
// =========================================================================
// strip_thinking_prefix — additional branch coverage
// =========================================================================
/// A "commentary" prefix followed directly by `json{` is stripped down to the
/// JSON object.
#[test]
fn strip_thinking_prefix_commentary_with_json_brace() {
    let raw = r#"commentaryto=functions.submit_proposal json{"key": 1}"#;
    assert_eq!(strip_thinking_prefix(raw), r#"{"key": 1}"#);
}
/// Leading tokens ending in "assistant" + "final" are removed, leaving only
/// the JSON payload.
#[test]
fn strip_thinking_prefix_assistant_followed_by_final_then_json() {
    let raw = r#"random tokensassistantfinal{"data": true}"#;
    assert_eq!(strip_thinking_prefix(raw), r#"{"data": true}"#);
}
// =========================================================================
// Transport error detection — validate retry eligibility
// =========================================================================
/// Test helper: reports whether `err` downcasts to `LlmError::Transport`.
///
/// Per the original note, this mirrors the classifier in the Err arm of
/// `generate_structured_output`, so the tests exercise the same downcast
/// path as the retry loop and a rename or reshape of `LlmError::Transport`
/// surfaces as a compile error here.
fn is_transport_anyhow(err: &anyhow::Error) -> bool {
    use crate::telemetry::LlmError;
    err.downcast_ref::<LlmError>()
        .is_some_and(|llm_err| matches!(llm_err, LlmError::Transport(_)))
}
/// Every `LlmError::Transport` converted into an `anyhow::Error` must be
/// recognised as a retryable transport error, whatever I/O cause it wraps.
#[test]
fn transport_error_detection_covers_known_patterns() {
    use crate::telemetry::LlmError;
    use std::io::ErrorKind;
    let io_cases = [
        (
            ErrorKind::UnexpectedEof,
            "unexpected EOF during chunk size line",
        ),
        (ErrorKind::ConnectionReset, "connection reset by peer"),
        (ErrorKind::BrokenPipe, "broken pipe"),
        (ErrorKind::TimedOut, "request timed out after 30s"),
    ];
    for (kind, message) in io_cases {
        let cause: Box<dyn std::error::Error + Send + Sync> =
            Box::new(std::io::Error::new(kind, message));
        // Capture the text before the cause is moved into the error.
        let display = cause.to_string();
        let err: anyhow::Error = LlmError::Transport(cause).into();
        assert!(
            is_transport_anyhow(&err),
            "LlmError::Transport({display:?}) must classify as retryable transport"
        );
    }
}
/// Neither the other `LlmError` variants nor plain `anyhow` errors may be
/// classified as retryable transport errors.
#[test]
fn non_transport_errors_not_retryable() {
    use crate::telemetry::LlmError;
    // LlmError variants other than Transport.
    let llm_errors = [
        LlmError::ServerError { status: 500 },
        LlmError::PaymentRequired { status: 402 },
        LlmError::ContextOverflow {
            tokens: 5000,
            limit: 4096,
        },
        LlmError::Parse(Box::new(std::io::Error::other("parse failed"))),
        LlmError::RateLimit {
            retry_after_ms: None,
            status: 429,
        },
    ];
    // Plain anyhow errors with no LlmError anywhere in the chain.
    let plain_errors = [
        anyhow::anyhow!("Failed to parse structured output after 4 attempts"),
        anyhow::anyhow!("No choice in LLM response"),
    ];
    let candidates = llm_errors
        .into_iter()
        .map(anyhow::Error::from)
        .chain(plain_errors);
    for err in candidates {
        assert!(
            !is_transport_anyhow(&err),
            "Error should NOT classify as retryable transport: {err:#}"
        );
    }
}
/// Parsing an empty string as a proposal response fails with an EOF error.
///
/// Per the original note, this is the error seen when gpt-oss-120b returns
/// empty content with finish_reason=Stop (reasoning tokens consumed, no
/// visible output).
#[test]
fn empty_response_triggers_eof_parse_error() {
    let parsed = serde_json::from_str::<StructuredProposalResponse>("");
    assert!(parsed.is_err());
    let parse_error = parsed.unwrap_err();
    let hit_eof = parse_error.is_eof();
    assert!(
        hit_eof,
        "Empty string should trigger EOF error, got: {}",
        parse_error
    );
}
/// Parsing an empty string as a batch evaluation response fails with an EOF
/// error as well.
#[test]
fn empty_response_triggers_eof_parse_error_evaluation() {
    let parsed = serde_json::from_str::<StructuredBatchEvaluationResponse>("");
    assert!(parsed.is_err());
    let parse_error = parsed.unwrap_err();
    let hit_eof = parse_error.is_eof();
    assert!(
        hit_eof,
        "Empty string should trigger EOF error for evaluation, got: {}",
        parse_error
    );
}
/// Pins the wording of the empty-response retry feedback: it must say the
/// response was EMPTY and tell the model it MUST call the tool.
///
/// The message is built inline in this test from an empty `cleaned_json`;
/// it is not read back from production code.
#[test]
fn empty_response_retry_message_is_actionable() {
    let cleaned_json = "";
    let error_msg: String = match cleaned_json.trim() {
        "" => "Your response was EMPTY — no content was returned. You MUST call the tool with a JSON argument. Do NOT just think about the answer — you must actually output the tool call with the required fields.",
        _ => "other",
    }
    .to_string();
    let required_phrases = [
        (
            "EMPTY",
            "Error message should clearly indicate empty response",
        ),
        (
            "MUST call the tool",
            "Error message should instruct the model to use the tool",
        ),
    ];
    for (phrase, why) in required_phrases {
        assert!(error_msg.contains(phrase), "{}", why);
    }
}
// =========================================================================
// deserialize_string_or_array_or_object — float value
// =========================================================================
/// A float supplied for `thought_process` is deserialized to its string form.
#[test]
fn deser_string_or_array_float_value() {
    let number = 7.89_f64;
    let payload = serde_json::json!({
        "thought_process": number,
        "solution_content": "x"
    });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();
    assert_eq!(parsed.thought_process, "7.89");
}
/// An array mixing numbers and strings is flattened into newline-separated
/// text, with each non-string element rendered as a string.
#[test]
fn deser_string_or_array_array_with_numbers() {
    let payload = serde_json::json!({
        "thought_process": [1, 2.5, "text"],
        "solution_content": "y"
    });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();
    let lines: Vec<&str> = parsed.thought_process.split('\n').collect();
    for (idx, expected) in ["1", "2.5", "text"].into_iter().enumerate() {
        assert_eq!(lines[idx], expected);
    }
}
// =========================================================================
// truncate_for_dump — boundary and edge cases
// =========================================================================
/// Input whose length equals the limit is returned untouched.
#[test]
fn truncate_for_dump_exact_limit() {
    use super::truncate_for_dump;
    // Five characters against a limit of five.
    let text = "abcde";
    let output = truncate_for_dump(text, 5);
    assert_eq!(output, "abcde");
    assert!(!output.contains("truncated"));
}
/// A limit of one keeps only the first character and appends the note.
#[test]
fn truncate_for_dump_max_one_char() {
    use super::truncate_for_dump;
    let output = truncate_for_dump("hello", 1);
    assert!(output.starts_with('h'));
    assert!(output.contains("... (truncated at 1 chars)"));
}
/// A limit of zero keeps none of the input: the output is an empty prefix
/// followed by the truncation note.
#[test]
fn truncate_for_dump_max_zero() {
    use super::truncate_for_dump;
    // max_chars = 0 should produce an empty prefix with truncation note
    let result = truncate_for_dump("hello", 0);
    assert!(result.contains("... (truncated at 0 chars)"));
    // Check the "empty prefix" half of the contract too, not just the note.
    let prefix = result.split("\n... (truncated at 0 chars)").next().unwrap();
    assert_eq!(prefix, "");
}
/// Truncating a string of four-byte emoji keeps whole characters.
#[test]
fn truncate_for_dump_multibyte_boundary() {
    use super::truncate_for_dump;
    // Four emoji, each encoded as four bytes.
    let emoji = "\u{1F600}\u{1F601}\u{1F602}\u{1F603}";
    let output = truncate_for_dump(emoji, 2);
    assert!(output.contains("... (truncated at 2 chars)"));
    // Exactly two emoji must precede the truncation note.
    let (kept, _) = output.split_once("\n... (truncated at 2 chars)").unwrap();
    assert_eq!(kept.chars().count(), 2);
}
/// Newlines inside the retained prefix are kept as-is.
#[test]
fn truncate_for_dump_newlines_preserved() {
    use super::truncate_for_dump;
    let text = "line1\nline2\nline3\nline4";
    let output = truncate_for_dump(text, 12);
    // The first 12 characters are "line1\nline2\n".
    let expected_prefix = "line1\nline2\n";
    assert!(output.contains(expected_prefix));
    assert!(output.contains("... (truncated at 12 chars)"));
}
// =========================================================================
// write_failure_dump — additional branch coverage
// =========================================================================
/// An explicit `"off"` config produces no dump.
#[test]
#[serial]
fn write_failure_dump_off_config_returns_none() {
    let params = FailureDumpParams {
        kind: "test",
        agent_name: "agent",
        model_name: "model",
        provider_id: "prov",
        error: "err",
        session_id: None,
        phase: None,
        round: None,
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some("off"),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_none(), "'off' config should return None");
}
/// An empty config string produces no dump.
#[test]
#[serial]
fn write_failure_dump_empty_config_returns_none() {
    let params = FailureDumpParams {
        kind: "test",
        agent_name: "agent",
        model_name: "model",
        provider_id: "prov",
        error: "err",
        session_id: None,
        phase: None,
        round: None,
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some(""),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_none(), "Empty config should return None");
}
/// Config `"on"` writes a dump containing the header, agent, phase and error
/// text.
#[test]
#[serial]
fn write_failure_dump_on_config_creates_file() {
    let params = FailureDumpParams {
        kind: "api_error",
        agent_name: "on-test-agent",
        model_name: "test-model",
        provider_id: "test-provider",
        error: "connection timeout",
        session_id: Some("ontest123"),
        phase: Some("evaluate"),
        round: Some(3),
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some("on"),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_some(), "'on' config should create a dump");
    let dump_path = outcome.unwrap();
    let body = std::fs::read_to_string(&dump_path).unwrap();
    assert!(body.contains("# api_error"));
    assert!(body.contains("| agent | on-test-agent |"));
    assert!(body.contains("| phase | evaluate |"));
    assert!(body.contains("connection timeout"));
    // Remove the dump directory created relative to the CWD.
    let _ = std::fs::remove_dir_all("failures");
}
/// Two dumps sharing session, kind and round end up in one file, divided by
/// a `---` separator, with both entries and both attempt labels present.
#[test]
#[serial]
fn write_failure_dump_append_mode_adds_separator() {
    // First entry.
    let first_params = FailureDumpParams {
        kind: "parse_error",
        agent_name: "append-agent",
        model_name: "model",
        provider_id: "prov",
        error: "first error",
        session_id: Some("appendtst"),
        phase: None,
        round: Some(1),
        attempt: Some(1),
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some("1"),
    };
    let first_outcome = write_failure_dump(first_params);
    assert!(first_outcome.is_some());
    // Second entry: same session, kind and round, so it targets the same file.
    let second_params = FailureDumpParams {
        kind: "parse_error",
        agent_name: "append-agent",
        model_name: "model",
        provider_id: "prov",
        error: "second error",
        session_id: Some("appendtst"),
        phase: None,
        round: Some(1),
        attempt: Some(2),
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some("1"),
    };
    let second_outcome = write_failure_dump(second_params);
    assert!(second_outcome.is_some());
    let dump_path = second_outcome.unwrap();
    let body = std::fs::read_to_string(&dump_path).unwrap();
    let expected_fragments = [
        // Appending inserts a "---" separator between entries.
        ("---", "Append should add separator"),
        ("first error", "Should contain first entry"),
        ("second error", "Should contain second entry"),
        ("(attempt 1)", "First attempt label should be present"),
        ("(attempt 2)", "Second attempt label should be present"),
    ];
    for (fragment, why) in expected_fragments {
        assert!(body.contains(fragment), "{}", why);
    }
    let _ = std::fs::remove_dir_all("failures");
}
/// Without a session id the returned path contains the agent name, and a
/// missing round shows up as `_r0.md`.
#[test]
#[serial]
fn write_failure_dump_no_session_id_uses_timestamp_dir() {
    let params = FailureDumpParams {
        kind: "api_error",
        agent_name: "no-session-agent",
        model_name: "model",
        provider_id: "prov",
        error: "err",
        // Deliberately no session id.
        session_id: None,
        phase: None,
        round: None,
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some("1"),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_some());
    let dump_path = outcome.unwrap();
    // Without session_id, directory name uses timestamp_agentname pattern
    assert!(dump_path.contains("no-session-agent"));
    // An absent round is written as 0.
    assert!(dump_path.contains("_r0.md"));
    let _ = std::fs::remove_dir_all("failures");
}
/// Full mode with a message list: the dump has system prompt, request body
/// and messages sections, the token/finish-reason metadata rows, and the
/// raw response.
#[test]
#[serial]
fn write_failure_dump_full_mode_with_messages() {
    use async_openai::types::{ChatCompletionRequestMessage, ChatCompletionRequestUserMessage};
    let conversation = vec![ChatCompletionRequestMessage::User(
        ChatCompletionRequestUserMessage::from("Hello!"),
    )];
    let params = FailureDumpParams {
        kind: "parse_error",
        agent_name: "msg-agent",
        model_name: "gpt-4",
        provider_id: "openai",
        error: "parse failed",
        session_id: Some("msgtest12"),
        phase: None,
        round: Some(1),
        attempt: None,
        finish_reason: Some("length"),
        input_tokens: Some(1000),
        output_tokens: Some(500),
        response_content: Some("partial response"),
        system_prompt: Some("You are helpful."),
        request_body: Some("{\"model\": \"gpt-4\"}"),
        messages: Some(&conversation),
        failure_dumps_config: Some("full"),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_some());
    let dump_path = outcome.unwrap();
    let body = std::fs::read_to_string(&dump_path).unwrap();
    // Context sections that only full mode writes.
    assert!(body.contains("## System Prompt"));
    assert!(body.contains("You are helpful."));
    assert!(body.contains("## Request Body"));
    assert!(body.contains("## Messages"));
    // Metadata rows.
    assert!(body.contains("| finish_reason | length |"));
    assert!(body.contains("| input_tokens | 1000 |"));
    assert!(body.contains("| output_tokens | 500 |"));
    // Raw response section.
    assert!(body.contains("## LLM Response"));
    assert!(body.contains("partial response"));
    let _ = std::fs::remove_dir_all("failures");
}
/// In full mode, a second dump appended to the same file does not repeat
/// the system prompt section: the heading appears exactly once.
#[test]
#[serial]
fn write_failure_dump_full_mode_append_skips_context() {
    // First write in full mode carries the system prompt.
    let first_params = FailureDumpParams {
        kind: "parse_error",
        agent_name: "ctx-agent",
        model_name: "model",
        provider_id: "prov",
        error: "first",
        session_id: Some("ctxtest12"),
        phase: None,
        round: Some(1),
        attempt: Some(1),
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: Some("System prompt content here"),
        request_body: Some("{\"body\": true}"),
        messages: None,
        failure_dumps_config: Some("full"),
    };
    let _first_outcome = write_failure_dump(first_params);
    // Second write appends and should leave out the prompt/request body.
    let second_params = FailureDumpParams {
        kind: "parse_error",
        agent_name: "ctx-agent",
        model_name: "model",
        provider_id: "prov",
        error: "second",
        session_id: Some("ctxtest12"),
        phase: None,
        round: Some(1),
        attempt: Some(2),
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: Some("System prompt content here"),
        request_body: Some("{\"body\": true}"),
        messages: None,
        failure_dumps_config: Some("full"),
    };
    let dump_path = write_failure_dump(second_params).unwrap();
    let body = std::fs::read_to_string(&dump_path).unwrap();
    // Count the heading: only the first write should have emitted it.
    let sys_count = body.matches("## System Prompt").count();
    assert_eq!(
        sys_count, 1,
        "System Prompt should appear only once (first write), found {}",
        sys_count
    );
    let _ = std::fs::remove_dir_all("failures");
}
/// An agent name containing `/`, spaces, parentheses and `\` still yields a
/// dump, and the returned path contains no parentheses.
// NOTE(review): the first path assertion passes for any path that starts
// with "failures/", so on its own it does not prove the '/' in the agent
// name was sanitized — confirm against the path format.
#[test]
#[serial]
fn write_failure_dump_special_chars_in_agent_name() {
    let params = FailureDumpParams {
        kind: "test",
        agent_name: "agent/with (special) chars\\here",
        model_name: "model",
        provider_id: "prov",
        error: "err",
        session_id: Some("special1"),
        phase: None,
        round: Some(1),
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: None,
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some("1"),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_some());
    let dump_path = outcome.unwrap();
    // Special chars should be sanitized to underscores
    let slash_ok = dump_path.starts_with("failures/") || !dump_path.contains('/');
    assert!(slash_ok, "Agent name special chars should be sanitized");
    for paren in ['(', ')'] {
        assert!(!dump_path.contains(paren));
    }
    let _ = std::fs::remove_dir_all("failures");
}
/// A 5000-character response is cut at 4000 characters in the dump.
#[test]
#[serial]
fn write_failure_dump_response_truncation() {
    let oversized = "X".repeat(5000);
    let params = FailureDumpParams {
        kind: "parse_error",
        agent_name: "trunc-agent",
        model_name: "model",
        provider_id: "prov",
        error: "parse error",
        session_id: Some("trunctest"),
        phase: None,
        round: Some(1),
        attempt: None,
        finish_reason: None,
        input_tokens: None,
        output_tokens: None,
        response_content: Some(&oversized),
        system_prompt: None,
        request_body: None,
        messages: None,
        failure_dumps_config: Some("1"),
    };
    let outcome = write_failure_dump(params);
    assert!(outcome.is_some());
    let dump_path = outcome.unwrap();
    let body = std::fs::read_to_string(&dump_path).unwrap();
    // The truncation note must name the 4000-char cap...
    assert!(body.contains("truncated at 4000 chars"));
    // ...and the untruncated 5000-char payload must be absent.
    assert!(!body.contains(&oversized));
    let _ = std::fs::remove_dir_all("failures");
}
// =========================================================================
// prune_failure_dirs — edge case: directories with nested content
// =========================================================================
/// Directories that contain files are pruned just like empty ones: four
/// populated job directories are reduced to two.
#[test]
fn prune_failure_dirs_with_nested_content() {
    use super::prune_failure_dirs;
    let root = std::env::temp_dir().join(format!("nsed_prune_test_nested_{}", std::process::id()));
    let _ = std::fs::remove_dir_all(&root);
    std::fs::create_dir_all(&root).unwrap();
    // Four job directories, each holding one dump file.
    for idx in 0..4 {
        let job_dir = root.join(format!("job_{idx}"));
        std::fs::create_dir_all(&job_dir).unwrap();
        std::fs::write(job_dir.join("parse_error_r1.md"), format!("error {idx}")).unwrap();
        // Pause between creations so the directories get different mtimes.
        std::thread::sleep(std::time::Duration::from_millis(50));
    }
    prune_failure_dirs(root.to_str().unwrap(), 2);
    let dir_count = std::fs::read_dir(&root)
        .unwrap()
        .filter_map(Result::ok)
        .filter(|entry| matches!(entry.file_type(), Ok(ft) if ft.is_dir()))
        .count();
    assert_eq!(dir_count, 2, "Should prune down to 2 directories");
    let _ = std::fs::remove_dir_all(&root);
}
// =========================================================================
// StructuredProposalResponse — additional deserialization edge cases
// =========================================================================
/// A JSON object supplied for `solution_content` is turned into a string
/// that still contains the object's key and value text.
#[test]
fn deser_proposal_object_solution_content() {
    let payload = serde_json::json!({
        "thought_process": "thinking",
        "solution_content": {"code": "fn main() {}", "language": "rust"}
    });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();
    let solution = &parsed.solution_content;
    assert!(solution.contains("code"));
    assert!(solution.contains("fn main()"));
}
/// An array of strings supplied for `solution_content` is joined with
/// newlines, one line per element.
#[test]
fn deser_proposal_array_solution_content() {
    let payload = serde_json::json!({
        "thought_process": "thinking",
        "solution_content": ["Part 1", "Part 2", "Part 3"]
    });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();
    let solution = &parsed.solution_content;
    assert!(solution.contains("Part 1"));
    assert!(solution.contains("Part 2"));
    assert!(solution.contains("Part 3"));
    // Splitting on '\n' must give back three pieces.
    let line_count = solution.split('\n').count();
    assert_eq!(line_count, 3);
}
// =========================================================================
// StructuredBatchEvaluationResponse — alias coverage
// =========================================================================
/// The `candidate_evaluations` key is accepted as an alias for the
/// evaluations list.
#[test]
fn deser_batch_eval_candidate_evaluations_alias() {
    let payload = serde_json::json!({
        "candidate_evaluations": [
            {"agent_id": "a1", "endorsement_weight": 80.0}
        ]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();
    assert_eq!(parsed.evaluations.len(), 1);
    let first = &parsed.evaluations[0];
    assert_eq!(first.agent_id, "a1");
    assert_eq!(first.endorsement_weight, 80.0);
}
/// The `candidates` key is accepted as an alias for the evaluations list,
/// with `candidate_id` and `score` mapping onto `agent_id` and
/// `endorsement_weight`.
#[test]
fn deser_batch_eval_candidates_alias() {
    let payload = serde_json::json!({
        "candidates": [
            {"candidate_id": "c1", "score": 65.0, "justification": "Good work"}
        ]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();
    assert_eq!(parsed.evaluations.len(), 1);
    let first = &parsed.evaluations[0];
    assert_eq!(first.agent_id, "c1");
    assert_eq!(first.endorsement_weight, 65.0);
    assert_eq!(first.justification, Some("Good work".to_string()));
}
/// An evaluation with every optional field populated deserializes; the test
/// checks the entry count, the final-solution flag and that category scores
/// are present.
#[test]
fn deser_batch_eval_with_all_optional_fields() {
    let payload = serde_json::json!({
        "evaluations": [{
            "agent_id": "agent_1",
            "endorsement_weight": 75.0,
            "justification": "Solid approach",
            "is_final_solution": true,
            "stance": "agree",
            "claim_assessments": [{
                "claim": "Memory safety is guaranteed",
                "verdict": "verified"
            }],
            "disagreements": [{
                "what_they_claim": "Always faster",
                "what_you_believe": "Not always",
                "confidence": "high"
            }],
            "category_scores": {
                "correctness": 80.0,
                "completeness": 70.0,
                "novelty": 60.0,
                "feasibility": 90.0,
                "evidence_quality": 75.0
            }
        }]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();
    assert_eq!(parsed.evaluations.len(), 1);
    let first = &parsed.evaluations[0];
    assert!(first.is_final_solution);
    assert!(first.category_scores.is_some());
}
// =========================================================================
// strip_thinking_prefix — additional branches
// =========================================================================
/// A "commentary" prefix where `json` is followed by a space and then `{` is
/// stripped down to the JSON object.
#[test]
fn strip_thinking_prefix_commentary_with_json_space() {
    let raw = r#"commentaryto=functions.submit_proposal json {"key": 1}"#;
    assert_eq!(strip_thinking_prefix(raw), r#"{"key": 1}"#);
}
/// A "commentary" prefix with no `json` keyword after it leaves the text
/// unchanged.
#[test]
fn strip_thinking_prefix_commentary_no_json_unchanged() {
    let raw = "commentary some random text without a special keyword";
    let stripped = strip_thinking_prefix(raw);
    // Neither "json{" nor "json " occurs, so the input comes back as given.
    assert_eq!(stripped, raw);
}
/// "assistant" immediately followed by `#` strips everything up to the
/// markdown heading.
#[test]
fn strip_thinking_prefix_assistant_then_hash() {
    let raw = "analysis tokens here assistant# Solution Header";
    assert_eq!(strip_thinking_prefix(raw), "# Solution Header");
}
// =========================================================================
// strip_scratchpad — additional edge cases for coverage
// =========================================================================
/// With text only before the scratchpad, the block is removed and the
/// trailing newline before it is trimmed.
#[test]
fn strip_scratchpad_only_text_before() {
    let raw = "Important preamble text\n<scratchpad>scratch content</scratchpad>";
    let cleaned = strip_scratchpad(raw);
    assert_eq!(cleaned, "Important preamble text");
    assert!(!cleaned.contains("scratch content"));
}
/// With text only after the scratchpad, just that text remains.
#[test]
fn strip_scratchpad_only_text_after() {
    let raw = "<scratchpad>notes</scratchpad>\nAfter the scratchpad";
    assert_eq!(strip_scratchpad(raw), "After the scratchpad");
}
/// Whitespace on either side of the scratchpad block is trimmed, and the
/// surrounding text is joined by a blank line.
#[test]
fn strip_scratchpad_whitespace_between() {
    let raw = "Before  \n\n  <scratchpad>internal</scratchpad>  \n\n  After";
    assert_eq!(strip_scratchpad(raw), "Before\n\nAfter");
}
/// An empty scratchpad block is removed like any other.
#[test]
fn strip_scratchpad_empty_tags() {
    let raw = "Text before <scratchpad></scratchpad> text after";
    assert_eq!(strip_scratchpad(raw), "Text before\n\ntext after");
}
/// A closing tag with no opening tag is not treated as a scratchpad block;
/// the input is returned as-is.
#[test]
fn strip_scratchpad_closing_tag_only() {
    let raw = "</scratchpad>Content after closing tag";
    let cleaned = strip_scratchpad(raw);
    assert_eq!(cleaned, raw);
}
// =========================================================================
// strip_working_memory — additional edge cases
// =========================================================================
/// An empty working-memory block is removed and the surrounding text is
/// joined by a blank line.
#[test]
fn strip_working_memory_empty_tags() {
    let raw = "Before <working_memory></working_memory> After";
    assert_eq!(strip_working_memory(raw), "Before\n\nAfter");
}
/// Input consisting solely of a working-memory block strips to nothing.
#[test]
fn strip_working_memory_only_working_memory() {
    let raw = "<working_memory>all content is ephemeral</working_memory>";
    assert_eq!(strip_working_memory(raw), "");
}
/// With three working-memory blocks, none of the opening tags survive and
/// the text between them is kept.
#[test]
fn strip_working_memory_three_blocks() {
    let raw = "<working_memory>a</working_memory>\n\ntext1\n\n<working_memory>b</working_memory>\n\ntext2\n\n<working_memory>c</working_memory>";
    let cleaned = strip_working_memory(raw);
    assert!(!cleaned.contains("<working_memory>"));
    for kept in ["text1", "text2"] {
        assert!(cleaned.contains(kept));
    }
}
/// A working-memory block holding only whitespace strips to nothing.
#[test]
fn strip_working_memory_with_whitespace_only_content() {
    let raw = "<working_memory>   \n\t  </working_memory>";
    assert_eq!(strip_working_memory(raw), "");
}
// =========================================================================
// extract_evaluation_sections — additional edge cases
// =========================================================================
/// Only properly closed sections are extracted: the closed `key_findings`
/// is returned and the unclosed `strategy` is dropped.
#[test]
fn extract_evaluation_sections_one_closed_one_unclosed() {
    let raw = "<key_findings>data here</key_findings>\n<strategy>unclosed";
    let sections = extract_evaluation_sections(raw);
    assert_eq!(sections, "<key_findings>data here</key_findings>");
    assert!(!sections.contains("unclosed"));
}
/// Closing tags with no opening tags match no section, so the full input is
/// returned.
#[test]
fn extract_evaluation_sections_only_closing_tags() {
    let raw = "</key_findings></strategy>some text";
    let sections = extract_evaluation_sections(raw);
    assert_eq!(sections, raw, "Should fall back to full text");
}
/// With `key_findings`, `working_memory` and `strategy` all present, the two
/// evaluation sections are extracted and the working-memory block is left
/// out.
#[test]
fn extract_evaluation_sections_interleaved_with_working_memory() {
    let raw = "<key_findings>Important findings</key_findings>\n<working_memory>ephemeral</working_memory>\n<strategy>Long-term plan</strategy>";
    let sections = extract_evaluation_sections(raw);
    let wanted = [
        "<key_findings>Important findings</key_findings>",
        "<strategy>Long-term plan</strategy>",
    ];
    for section in wanted {
        assert!(sections.contains(section));
    }
    // Neither the working_memory tag nor its content may leak through.
    for unwanted in ["working_memory", "ephemeral"] {
        assert!(!sections.contains(unwanted));
    }
}
// =========================================================================
// strip_thinking_prefix — remaining uncovered branches
// =========================================================================
/// The bare word "commentary" with no "json" keyword is expected to pass
/// through unchanged.
#[test]
fn strip_thinking_prefix_commentary_only() {
    let raw = "commentary";
    assert_eq!(strip_thinking_prefix(raw), "commentary");
}
/// "assistant" appearing as an ordinary word inside a sentence must not
/// trigger any stripping.
#[test]
fn strip_thinking_prefix_assistant_mid_content_not_at_start() {
    let sentence = "My assistant model works well";
    assert_eq!(
        strip_thinking_prefix(sentence),
        "My assistant model works well"
    );
}
/// A "final" prefix surrounded by whitespace: the expected output has the
/// prefix and the outer whitespace removed.
#[test]
fn strip_thinking_prefix_final_with_leading_whitespace() {
    let padded = " final **Bold** ";
    let stripped = strip_thinking_prefix(padded);
    assert_eq!(stripped, "**Bold**");
}
/// Two "assistant" markers in one input: the expected output keeps everything
/// after the first "assistantfinal", including the second "assistant".
#[test]
fn strip_thinking_prefix_multiple_assistant_occurrences() {
    let raw = "analysisassistantfinal**Content**assistantmore";
    let expected = "**Content**assistantmore";
    assert_eq!(strip_thinking_prefix(raw), expected);
}
// =========================================================================
// prune_failure_dirs — single directory and empty root
// =========================================================================
/// With a limit of zero, the only sub-directory under the root is expected to
/// be pruned, leaving no directories behind.
#[test]
fn prune_failure_dirs_single_dir_over_limit() {
    use super::prune_failure_dirs;

    // Scratch root whose name embeds the current process id.
    let root = std::env::temp_dir().join(format!("nsed_prune_test_single_{}", std::process::id()));
    // Best-effort removal of anything left at that path; errors are ignored.
    let _ = std::fs::remove_dir_all(&root);
    std::fs::create_dir_all(&root).unwrap();
    std::fs::create_dir_all(root.join("only_dir")).unwrap();

    // A maximum of 0 asks for every directory to be removed.
    prune_failure_dirs(root.to_str().unwrap(), 0);

    // Count readable entries that are still directories.
    let surviving_dirs = std::fs::read_dir(&root)
        .unwrap()
        .flatten()
        .filter(|entry| matches!(entry.file_type(), Ok(kind) if kind.is_dir()))
        .count();
    assert_eq!(
        surviving_dirs,
        0,
        "Single dir should be removed when max=0"
    );

    let _ = std::fs::remove_dir_all(&root);
}
/// Pruning a root that has no sub-directories must complete without
/// panicking; nothing else is asserted.
#[test]
fn prune_failure_dirs_empty_dir() {
    use super::prune_failure_dirs;

    // Scratch root whose name embeds the current process id.
    let root = std::env::temp_dir().join(format!("nsed_prune_test_empty_{}", std::process::id()));
    // Best-effort removal of anything left at that path; errors are ignored.
    let _ = std::fs::remove_dir_all(&root);
    std::fs::create_dir_all(&root).unwrap();

    // The root is empty, so the call is only required to return normally.
    prune_failure_dirs(root.to_str().unwrap(), 5);

    let _ = std::fs::remove_dir_all(&root);
}
// =========================================================================
// deserialize_string_or_array_or_object — additional edge cases
// =========================================================================
/// Empty strings in both fields must deserialize to empty strings.
#[test]
fn deser_string_or_array_empty_string() {
    let payload = serde_json::json!({ "thought_process": "", "solution_content": "" });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();

    assert_eq!(parsed.thought_process, "");
    assert_eq!(parsed.solution_content, "");
}
/// A whitespace-only `thought_process` string must be preserved verbatim,
/// without trimming.
#[test]
fn deser_string_or_array_whitespace_only_string() {
    let payload = serde_json::json!({ "thought_process": " \n\t ", "solution_content": "x" });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();

    assert_eq!(parsed.thought_process, " \n\t ");
}
/// An array of three empty strings is expected to deserialize to two newline
/// characters.
#[test]
fn deser_string_or_array_array_of_empty_strings() {
    let payload = serde_json::json!({ "thought_process": ["", "", ""], "solution_content": "x" });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();

    assert_eq!(parsed.thought_process, "\n\n");
}
// =========================================================================
// StructuredBatchEvaluationResponse — edge cases
// =========================================================================
/// An empty `evaluations` array must deserialize to an empty collection.
#[test]
fn batch_eval_empty_evaluations_array() {
    let payload = serde_json::json!({ "evaluations": [] });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();

    assert!(parsed.evaluations.is_empty());
}
/// An evaluation carrying only `agent_id` and `endorsement_weight` must
/// deserialize, with every other field at its empty / `None` / `false` value.
#[test]
fn batch_eval_minimal_item() {
    let payload = serde_json::json!({
        "evaluations": [
            { "agent_id": "a", "endorsement_weight": 50.0 }
        ]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();
    assert_eq!(parsed.evaluations.len(), 1);

    let item = &parsed.evaluations[0];
    // Fields supplied in the payload.
    assert_eq!(item.agent_id, "a");
    assert_eq!(item.endorsement_weight, 50.0);
    // Fields omitted from the payload.
    assert!(item.justification.is_none());
    assert!(!item.is_final_solution);
    assert!(item.stance.is_none());
    assert!(item.claim_assessments.is_empty());
    assert!(item.disagreements.is_empty());
    assert!(item.category_scores.is_none());
}
/// The `candidate_id` key must be accepted in place of `agent_id`.
#[test]
fn batch_eval_candidate_id_alias_for_agent_id() {
    let payload = serde_json::json!({
        "evaluations": [
            { "candidate_id": "candidate_x", "endorsement_weight": 70.0 }
        ]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();

    assert_eq!(parsed.evaluations[0].agent_id, "candidate_x");
}
/// The `score` key must be accepted in place of `endorsement_weight`.
#[test]
fn batch_eval_score_alias_for_endorsement_weight() {
    let payload = serde_json::json!({
        "evaluations": [
            { "agent_id": "b", "score": 85.0 }
        ]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();

    assert_eq!(parsed.evaluations[0].endorsement_weight, 85.0);
}
// =========================================================================
// truncate_for_dump — max exceeds input length
// =========================================================================
/// A limit far above the input length must return the input untouched.
#[test]
fn truncate_for_dump_max_exceeds_length() {
    use super::truncate_for_dump;

    let text = "short";
    assert_eq!(truncate_for_dump(text, 1000), "short");
}
// =========================================================================
// truncate_for_dump — additional edge cases
// =========================================================================
/// Empty input must produce empty output.
#[test]
fn truncate_for_dump_empty_input() {
    use super::truncate_for_dump;

    assert_eq!(truncate_for_dump("", 100), "");
}
/// Input whose length equals the limit must be returned unchanged.
#[test]
fn truncate_for_dump_exact_length() {
    use super::truncate_for_dump;

    let text = "12345";
    assert_eq!(truncate_for_dump(text, 5), "12345");
}
/// Input one character over the limit: the output must start with the first
/// five characters and mention the truncation point.
#[test]
fn truncate_for_dump_one_over() {
    use super::truncate_for_dump;

    let dumped = truncate_for_dump("123456", 5);

    assert!(dumped.starts_with("12345"));
    assert!(dumped.contains("truncated at 5 chars"));
}
/// Truncating five multi-byte emoji to three: the output must start with the
/// first three emoji and mention the truncation point.
#[test]
fn truncate_for_dump_unicode() {
    use super::truncate_for_dump;

    // Five emoji: one `char` each, several bytes each.
    let emoji = "\u{1F600}\u{1F601}\u{1F602}\u{1F603}\u{1F604}";
    assert_eq!(emoji.chars().count(), 5);

    let dumped = truncate_for_dump(emoji, 3);

    assert!(dumped.starts_with("\u{1F600}\u{1F601}\u{1F602}"));
    assert!(dumped.contains("truncated at 3 chars"));
}
/// A limit of zero: the output must mention the truncation point and must not
/// start with the input's first character.
#[test]
fn truncate_for_dump_zero_max() {
    use super::truncate_for_dump;

    let dumped = truncate_for_dump("something", 0);

    assert!(dumped.contains("truncated at 0 chars"));
    assert!(!dumped.starts_with('s'));
}
// =========================================================================
// strip_thinking_prefix — commentary branches
// =========================================================================
/// A "commentary… json" prefix immediately followed by `{` must be removed,
/// leaving only the JSON object.
#[test]
fn strip_thinking_prefix_commentary_json_brace() {
    let raw = r#"commentaryto=functions.submit_proposal json{"solution": "42"}"#;
    let expected = r#"{"solution": "42"}"#;
    assert_eq!(strip_thinking_prefix(raw), expected);
}
/// Same as the brace variant, but with a space between "json" and the object;
/// the space must not appear in the result.
#[test]
fn strip_thinking_prefix_commentary_json_space() {
    let raw = r#"commentaryto=functions.submit_proposal json {"solution": "42"}"#;
    let expected = r#"{"solution": "42"}"#;
    assert_eq!(strip_thinking_prefix(raw), expected);
}
/// "final" glued directly onto bold markdown must be stripped.
#[test]
fn strip_thinking_prefix_final_bold() {
    let raw = "final**My bold answer**";
    assert_eq!(strip_thinking_prefix(raw), "**My bold answer**");
}
/// "final" glued directly onto a markdown heading must be stripped.
#[test]
fn strip_thinking_prefix_final_heading() {
    let raw = "final# Heading";
    assert_eq!(strip_thinking_prefix(raw), "# Heading");
}
/// Analysis text followed by "assistantfinal" and a JSON object: only the
/// object is expected to remain.
#[test]
fn strip_thinking_prefix_assistant_final_json() {
    let raw = r#"analysisThinking about itassistantfinal{"key": "val"}"#;
    let expected = r#"{"key": "val"}"#;
    assert_eq!(strip_thinking_prefix(raw), expected);
}
/// "assistant" followed directly by a JSON object, with no "final" marker:
/// only the object is expected to remain.
#[test]
fn strip_thinking_prefix_assistant_json_direct() {
    let raw = r#"analysisassistant{"tool": true}"#;
    let expected = r#"{"tool": true}"#;
    assert_eq!(strip_thinking_prefix(raw), expected);
}
/// Ordinary prose with no prefix tokens must come back unchanged.
#[test]
fn strip_thinking_prefix_plain_text() {
    let prose = "Just regular content with no prefix tokens";
    assert_eq!(strip_thinking_prefix(prose), prose);
}
// =========================================================================
// StructuredBatchEvaluationResponse — more alias & field tests
// =========================================================================
/// A supplied `justification` string must be available on the parsed item.
#[test]
fn batch_eval_with_justification() {
    let payload = serde_json::json!({
        "evaluations": [
            {
                "agent_id": "a1",
                "endorsement_weight": 90.0,
                "justification": "Strong argument with evidence"
            }
        ]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();

    let justification = parsed.evaluations[0].justification.as_deref();
    assert_eq!(justification, Some("Strong argument with evidence"));
}
/// An explicit `is_final_solution: true` must be reflected on the parsed item.
#[test]
fn batch_eval_with_is_final_solution_true() {
    let payload = serde_json::json!({
        "evaluations": [
            { "agent_id": "a1", "endorsement_weight": 95.0, "is_final_solution": true }
        ]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();

    assert!(parsed.evaluations[0].is_final_solution);
}
/// Three evaluations must all be parsed, and the third must keep its own
/// weight.
#[test]
fn batch_eval_multiple_items() {
    let payload = serde_json::json!({
        "evaluations": [
            { "agent_id": "a", "endorsement_weight": 80.0 },
            { "agent_id": "b", "endorsement_weight": 60.0 },
            { "agent_id": "c", "endorsement_weight": 40.0 }
        ]
    });
    let parsed = serde_json::from_value::<StructuredBatchEvaluationResponse>(payload).unwrap();

    assert_eq!(parsed.evaluations.len(), 3);
    let third = &parsed.evaluations[2];
    assert_eq!(third.endorsement_weight, 40.0);
}
// =========================================================================
// deserialize_string_or_array_or_object — object, scalar, null, and mixed-array input
// =========================================================================
/// An object given for `thought_process` must deserialize to a string that
/// still contains one of its keys and one of its values.
#[test]
fn deser_string_or_array_nested_object_with_multiple_keys() {
    let payload = serde_json::json!({
        "thought_process": { "step1": "analyze", "step2": "conclude" },
        "solution_content": "x"
    });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();

    // Only one key and one value are spot-checked.
    for fragment in ["step1", "analyze"] {
        assert!(parsed.thought_process.contains(fragment));
    }
}
/// A numeric `thought_process` must deserialize to its decimal text.
#[test]
fn deser_string_or_array_number() {
    let payload = serde_json::json!({ "thought_process": 42, "solution_content": "x" });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();

    assert_eq!(parsed.thought_process, "42");
}
/// A boolean `thought_process` must deserialize to the text "true".
#[test]
fn deser_string_or_array_bool() {
    let payload = serde_json::json!({ "thought_process": true, "solution_content": "x" });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();

    assert_eq!(parsed.thought_process, "true");
}
/// A JSON `null` for `thought_process` must deserialize to the literal text
/// "null".
#[test]
fn deser_string_or_array_null_becomes_string() {
    let payload = serde_json::json!({ "thought_process": null, "solution_content": "x" });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();

    assert_eq!(parsed.thought_process, "null");
}
/// An array mixing a string, number, bool, null and object must deserialize
/// to text containing the string, number and bool elements.
#[test]
fn deser_string_or_array_mixed_types_in_array() {
    let payload = serde_json::json!({
        "thought_process": ["text", 123, true, null, { "k": "v" }],
        "solution_content": "x"
    });
    let parsed = serde_json::from_value::<StructuredProposalResponse>(payload).unwrap();

    // The null and object elements are not checked here.
    for fragment in ["text", "123", "true"] {
        assert!(parsed.thought_process.contains(fragment));
    }
}
// =========================================================================
// strip_scratchpad — multiline content and tag-free input
// =========================================================================
/// A scratchpad spanning several lines must be removed in full, while the
/// text before and after it is kept.
#[test]
fn strip_scratchpad_multiline_content() {
    let raw = "Start\n<scratchpad>\nLine 1\nLine 2\nLine 3\n</scratchpad>\nEnd";
    let stripped = strip_scratchpad(raw);

    // No line from inside the scratchpad may survive.
    for dropped in ["Line 1", "Line 2", "Line 3"] {
        assert!(!stripped.contains(dropped));
    }
    // The surrounding text must be kept.
    for kept in ["Start", "End"] {
        assert!(stripped.contains(kept));
    }
}
/// Input without scratchpad tags must come back unchanged.
#[test]
fn strip_scratchpad_no_tags() {
    let plain = "Content without any scratchpad tags";
    assert_eq!(strip_scratchpad(plain), plain);
}
// =========================================================================
// strip_working_memory — no tags
// =========================================================================
/// Input without working-memory tags must come back unchanged.
#[test]
fn strip_working_memory_no_tags() {
    let plain = "Content without any working_memory tags";
    assert_eq!(strip_working_memory(plain), plain);
}
}