use anyhow::Result;
use async_trait::async_trait;
use chrono::Utc;
use futures::StreamExt;
use oxi_sdk::{Context, Message, Model, Provider, UserMessage};
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use crate::evaluation::EvaluationResult;
use crate::interview::InterviewResult;
use crate::protocol::{ExecutionResult, OuroborosProtocol, Phase};
use crate::seed::{AmbiguityScore, Entity, Seed};
/// JSON payload expected back from the interview prompt.
///
/// Most fields carry serde defaults so that a partially filled LLM answer
/// still deserializes.
#[derive(Debug, Deserialize)]
struct InterviewResponse {
/// Whether the user message is a task; treated as `true` when the field is absent.
#[serde(default = "default_true")]
is_task: bool,
/// Conversational reply used when `is_task` is false; empty when absent.
#[serde(default)]
chat_response: String,
/// Plain clarifying questions; empty when absent.
#[serde(default)]
questions: Vec<String>,
/// Optional structured form of the questions (kind + options).
#[serde(default)]
structured_questions: Option<Vec<InterviewQuestionOutput>>,
/// Clarity scores; callers substitute fallback values when this is `None`.
scores: Option<AmbiguityScores>,
/// Complexity label requested from the model ("simple" / "complex");
/// falls back to `default_complexity()` when absent.
#[serde(default = "default_complexity")]
complexity: String,
}
/// Serde default for `InterviewResponse::complexity`: assume the task is complex.
fn default_complexity() -> String {
    String::from("complex")
}
/// Serde default helper that yields `true` (used for `InterviewResponse::is_task`).
fn default_true() -> bool {
    true
}
/// Per-dimension clarity scores returned by the interview prompt.
///
/// The prompt asks the model for values in the 0.0-1.0 range; the values are
/// not range-checked here before being passed to `AmbiguityScore::new`.
#[derive(Debug, Deserialize)]
struct AmbiguityScores {
/// How clear the user's goal is.
goal_clarity: f64,
/// How clearly constraints (filenames, paths, content) are specified.
constraint_clarity: f64,
/// How clearly "done" is defined.
success_criteria: f64,
}
/// JSON payload expected back from the seed and evolve prompts.
#[derive(Debug, Deserialize)]
struct SeedResponse {
/// Single goal statement for the seed.
goal: String,
/// Constraints the execution must respect.
constraints: Vec<String>,
/// Acceptance criteria used later during evaluation.
acceptance_criteria: Vec<String>,
/// Domain entities; empty when the model omits the field.
#[serde(default)]
ontology: Vec<Entity>,
}
/// One selectable option attached to a structured interview question.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterviewOptionOutput {
/// Value identifying the option (e.g. "yes" / "no" for synthesized yes/no options).
pub value: String,
/// Human-readable label for the option.
pub label: String,
/// Optional longer description; empty when absent.
#[serde(default)]
pub description: String,
}
/// A structured clarifying question produced by the interview phase.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterviewQuestionOutput {
/// Question identifier supplied by the model (the prompt suggests "q1"-style ids).
pub id: String,
/// Question text.
pub text: String,
/// Question kind; `interview_structured` accepts "single_choice", "multi_choice",
/// "yes_no" and "free_text". Defaults to "free_text" when absent.
#[serde(default = "default_question_kind")]
pub kind: String,
/// Options offered for choice-style questions; empty when absent.
#[serde(default)]
pub options: Vec<InterviewOptionOutput>,
}
/// Serde default for `InterviewQuestionOutput::kind`: an open-ended question.
fn default_question_kind() -> String {
    "free_text".to_owned()
}
/// JSON payload expected back from the evaluation prompt.
///
/// Every field is required; a missing field makes deserialization fail.
#[derive(Debug, Deserialize)]
struct EvaluationResponse {
/// Mechanical-check verdict as reported back by the model.
mechanical_pass: bool,
/// Whether the model judges the output to satisfy the goal semantically.
semantic_pass: bool,
/// Overall score; the prompt asks for a value between 0.0 and 1.0.
score: f64,
/// Free-form evaluation notes.
notes: Vec<String>,
}
/// LLM-backed implementation of the Ouroboros protocol phases
/// (interview, seed, execute, evaluate, evolve).
pub struct OuroborosEngine {
// Provider used to stream completions.
provider: Arc<dyn Provider>,
// Model passed to the provider on every completion request.
model: Model,
// Phase most recently entered by one of the protocol methods.
phase: parking_lot::Mutex<Phase>,
// Optional persona text prepended to every system prompt.
persona_prompt: parking_lot::Mutex<Option<String>>,
// Cache of evaluation results looked up by (seed, execution).
eval_cache: crate::eval_cache::EvalCache,
// Recent generation records (capped in `record_evaluation`) used for
// stagnation and regression detection.
generation_history: parking_lot::RwLock<Vec<crate::regression::GenerationRecord>>,
}
impl OuroborosEngine {
/// Creates an engine that starts in the interview phase with no persona
/// prompt, an empty generation history and a fresh evaluation cache.
pub fn new(provider: Arc<dyn Provider>, model: Model) -> Self {
    let phase = parking_lot::Mutex::new(Phase::Interview);
    let persona_prompt = parking_lot::Mutex::new(None);
    let eval_cache = crate::eval_cache::EvalCache::new(256);
    let generation_history = parking_lot::RwLock::new(Vec::new());
    Self {
        provider,
        model,
        phase,
        persona_prompt,
        eval_cache,
        generation_history,
    }
}
/// Returns a copy of the phase the engine most recently entered.
pub fn phase(&self) -> Phase {
    let current = self.phase.lock();
    *current
}
/// Overwrites the stored phase.
fn set_phase(&self, phase: Phase) {
    let mut current = self.phase.lock();
    *current = phase;
}
/// Appends a generation record for `seed` to the history.
///
/// Per-criterion results are derived from the evaluation notes: a note
/// starting with "✓ " counts as a pass, one starting with "✗ " or "x " as a
/// failure, and any other note is ignored. Once more than ten records are
/// stored, the oldest one is dropped.
pub fn record_evaluation(&self, seed: Seed, evaluation: &EvaluationResult) {
    const MAX_HISTORY: usize = 10;

    let mut ac_results: Vec<bool> = Vec::new();
    for note in evaluation.notes.iter() {
        if note.starts_with("✓ ") {
            ac_results.push(true);
        } else if note.starts_with("✗ ") || note.starts_with("x ") {
            ac_results.push(false);
        }
    }

    let entry = crate::regression::GenerationRecord {
        seed,
        ac_results,
        score: evaluation.score,
    };

    let mut records = self.generation_history.write();
    records.push(entry);
    if records.len() > MAX_HISTORY {
        records.remove(0);
    }
}
fn detect_stagnation(&self) -> Option<crate::lateral::StagnationPattern> {
let history = self.generation_history.read();
if history.len() < 2 {
return None;
}
let scores: Vec<f64> = history.iter().map(|r| r.score).collect();
let latest = *scores.last()?;
let prev = scores[scores.len() - 2];
let drift = (latest - prev).abs();
let improvement = latest - prev;
if drift < 0.01 {
return Some(crate::lateral::StagnationPattern::NoDrift);
}
if scores.len() >= 3 {
let prev2 = scores[scores.len() - 3];
if (latest > prev && prev < prev2) || (latest < prev && prev > prev2) {
return Some(crate::lateral::StagnationPattern::Oscillation);
}
}
if history.len() >= 3 && improvement > 0.0 {
let improvements: Vec<f64> = scores.windows(2).map(|w| w[1] - w[0]).collect();
if improvements.len() >= 2 {
let last = improvements[improvements.len() - 1];
let prev_imp = improvements[improvements.len() - 2];
if 0.0 < last && last < prev_imp * 0.5 {
return Some(crate::lateral::StagnationPattern::DiminishingReturns);
}
}
}
None
}
fn detect_regressions(&self) -> Vec<crate::regression::Regression> {
let history = self.generation_history.read();
crate::regression::RegressionDetector::new()
.record_all(history.iter().cloned())
.detect()
}
/// Replaces the persona text that is prepended to system prompts;
/// `None` removes it.
#[allow(dead_code)]
fn set_persona_prompt(&self, prompt: Option<String>) {
    let mut slot = self.persona_prompt.lock();
    *slot = prompt;
}
/// Sends one system + user message pair to the provider and returns the
/// concatenated text of the streamed response.
///
/// When a persona prompt is set it is placed before `system_prompt`,
/// separated by a blank line. Streaming stops at the first `Done` or `Error`
/// event. On `Error`, if the error carries text content, that text replaces
/// whatever was accumulated and is returned as `Ok`; otherwise the call
/// fails with "LLM stream error".
async fn llm_complete(&self, system_prompt: &str, user_message: &str) -> Result<String> {
    // The lock guard is a temporary of this statement, so it is released
    // before any `.await` below.
    let effective_system = match self.persona_prompt.lock().as_deref() {
        Some(persona) => format!("{persona}\n\n{system_prompt}"),
        None => system_prompt.to_string(),
    };

    let mut ctx = Context::new();
    ctx.set_system_prompt(effective_system);
    ctx.add_message(Message::User(UserMessage::new(user_message)));

    let stream = self.provider.stream(&self.model, &ctx, None).await?;
    tokio::pin!(stream);

    let mut collected = String::new();
    while let Some(event) = stream.next().await {
        match event {
            oxi_sdk::ProviderEvent::TextDelta { delta, .. } => collected.push_str(&delta),
            oxi_sdk::ProviderEvent::Done { .. } => break,
            oxi_sdk::ProviderEvent::Error { error, .. } => {
                let msg_text = error.text_content();
                if msg_text.is_empty() {
                    anyhow::bail!("LLM stream error");
                }
                collected = msg_text;
                break;
            }
            _ => {}
        }
    }
    Ok(collected)
}
fn parse_json<T: serde::de::DeserializeOwned>(raw: &str) -> Result<T> {
let trimmed = raw.trim();
let json_str = if trimmed.starts_with("```") {
let after_open = trimmed.find('\n').map(|i| i + 1).unwrap_or(0);
let before_close = trimmed.rfind("```").unwrap_or(trimmed.len());
&trimmed[after_open..before_close]
} else if let Some(start) = trimmed.find('{') {
if let Some(end) = trimmed.rfind('}') {
&trimmed[start..=end]
} else {
trimmed
}
} else if let Some(start) = trimmed.find('[') {
if let Some(end) = trimmed.rfind(']') {
&trimmed[start..=end]
} else {
trimmed
}
} else {
trimmed
};
Ok(serde_json::from_str(json_str.trim())?)
}
/// Runs a completion and parses the answer as JSON, retrying once with a
/// correction message when the first answer does not parse.
///
/// The retry message quotes the parse error and the first 500 characters of
/// the invalid output.
///
/// # Errors
/// Propagates completion errors, and returns "JSON parse failed after retry"
/// when the second answer is also unparsable.
async fn llm_json<T: serde::de::DeserializeOwned>(
    &self,
    system_prompt: &str,
    user_message: &str,
) -> Result<T> {
    let raw = self.llm_complete(system_prompt, user_message).await?;
    match Self::parse_json::<T>(&raw) {
        Ok(parsed) => Ok(parsed),
        Err(e) => {
            tracing::warn!(error = %e, "JSON parse failed, retrying with correction");
            // Truncate by characters: slicing at a fixed byte offset panics
            // when the offset falls inside a multi-byte UTF-8 character,
            // which is likely for non-English model output.
            let excerpt: String = raw.chars().take(500).collect();
            let retry_msg = format!(
                "Your previous response was invalid JSON. The error was: {}\n\n\
                Your raw output was:\n```\n{}\n```\n\n\
                Please respond with ONLY valid JSON matching the requested schema. \
                Do not include any text before or after the JSON object.",
                e, excerpt
            );
            let retry_raw = self.llm_complete(system_prompt, &retry_msg).await?;
            Self::parse_json::<T>(&retry_raw)
                .map_err(|e2| anyhow::anyhow!("JSON parse failed after retry: {e2}"))
        }
    }
}
/// Asks the model for structured clarifying questions about `user_input`.
///
/// Returns `Ok(None)` when the response cannot be parsed, when the message
/// is classified as conversation rather than a task, when the model supplied
/// no structured questions, or when none survive sanitization. Otherwise
/// returns the sanitized questions.
///
/// # Errors
/// Propagates errors from the underlying completion call.
pub async fn interview_structured(
    &self,
    user_input: &str,
) -> Result<Option<Vec<InterviewQuestionOutput>>> {
    let system_prompt = INTERVIEW_SYSTEM_PROMPT;
    let user_message = format!(
        "The user said:\n\"{user_input}\"\n\n\
        LANGUAGE: Write ALL text output (questions, chat_response, structured question labels and descriptions) in the SAME language as the user's message above.\n\n\
        Analyze this message and produce a JSON object with:\n\
        - \"is_task\": true if the message requests a concrete action (create, read, write, run, find, fix, analyze, deploy, etc.) or describes something to build/execute. false for greetings, small talk, questions, gratitude, opinions, or conversational messages.\n\
        - \"chat_response\": (only when is_task=false) A natural, friendly response in the user's language. Be warm, concise, and helpful. Skip this field when is_task=true.\n\
        - \"complexity\": (only when is_task=true) \"simple\" for clear single-action requests that need no clarification (check weather, set alarm, search, calculate, simple file read/write, echo). \"complex\" for ambiguous or multi-step tasks (modify code, write blog post, deploy, analyze). Default to \"complex\" when unsure.\n\
        - \"questions\": (only when is_task=true) Up to 3 Socratic clarifying questions in the user's language. Empty array when is_task=false.\n\
        - \"structured_questions\": (only when is_task=true) Parallel array matching `questions`. Each entry has {{ \"id\": \"q1\", \"text\": \"...\", \"kind\": \"single_choice\"|\"free_text\"|\"yes_no\", \"options\": [{{ \"value\": \"...\", \"label\": \"...\" }}] }}. All text fields MUST be in the user's language. Omit or set null when you cannot predict reasonable options. Skip when is_task=false.\n\
        - \"scores\": (only when is_task=true) {{ \"goal_clarity\": 0.0-1.0, \"constraint_clarity\": 0.0-1.0, \"success_criteria\": 0.0-1.0 }}. Skip this field when is_task=false.\n\n\
        IMPORTANT SCORING (when is_task=true):\n\
        - Score GOAL_CLARITY 0.9+ ONLY if the request is immediately executable with no ambiguity\n\
        - Score CONSTRAINT_CLARITY 0.8+ ONLY if specific filenames, paths, or content are provided\n\
        - Score SUCCESS_CRITERIA 0.7+ ONLY if 'done' is clearly defined\n\
        - Be HONEST with clarity scores. When in doubt, score LOWER."
    );
    let raw = self.llm_complete(system_prompt, &user_message).await?;
    let parsed: InterviewResponse = match Self::parse_json(&raw) {
        Ok(p) => p,
        Err(e) => {
            tracing::warn!(error = %e, "interview_structured: JSON parse failed");
            return Ok(None);
        }
    };
    if !parsed.is_task {
        return Ok(None);
    }
    let plain_questions = &parsed.questions;
    let structured = match parsed.structured_questions {
        Some(s) if !s.is_empty() => s,
        _ => return Ok(None),
    };
    let sanitized: Vec<InterviewQuestionOutput> = structured
        .into_iter()
        .filter_map(Self::sanitize_question)
        .collect();
    if sanitized.is_empty() {
        return Ok(None);
    }
    // Logged only: how many structured questions repeat a plain question verbatim.
    let match_count = sanitized
        .iter()
        .filter(|q| plain_questions.iter().any(|p| p == &q.text))
        .count();
    tracing::info!(
        structured = sanitized.len(),
        plain = plain_questions.len(),
        matched = match_count,
        "interview_structured produced questions"
    );
    Ok(Some(sanitized))
}

/// Normalizes one model-produced question, or drops it when it has no id or
/// no text.
///
/// Unknown kinds become "free_text" (options discarded). Options with an
/// empty value or label are removed *before* the emptiness checks, so a
/// choice question left without any usable option is downgraded to
/// "free_text" instead of being emitted as a choice with nothing to pick.
/// A "yes_no" question without options receives default Yes/No options.
fn sanitize_question(mut q: InterviewQuestionOutput) -> Option<InterviewQuestionOutput> {
    if q.id.is_empty() || q.text.is_empty() {
        return None;
    }
    if !matches!(
        q.kind.as_str(),
        "single_choice" | "multi_choice" | "yes_no" | "free_text"
    ) {
        q.kind = "free_text".to_string();
        q.options.clear();
    }
    q.options
        .retain(|o| !o.value.is_empty() && !o.label.is_empty());
    if matches!(q.kind.as_str(), "single_choice" | "multi_choice") && q.options.is_empty() {
        q.kind = "free_text".to_string();
    }
    if q.kind == "yes_no" && q.options.is_empty() {
        q.options = vec![
            InterviewOptionOutput {
                value: "yes".to_string(),
                label: "Yes".to_string(),
                description: String::new(),
            },
            InterviewOptionOutput {
                value: "no".to_string(),
                label: "No".to_string(),
                description: String::new(),
            },
        ];
    }
    Some(q)
}
}
#[async_trait]
impl OuroborosProtocol for OuroborosEngine {
/// Replaces the persona text that is prepended to system prompts;
/// `None` removes it.
fn set_persona_prompt(&self, prompt: Option<String>) {
    let mut slot = self.persona_prompt.lock();
    *slot = prompt;
}
/// Trait entry point that forwards to the inherent
/// `OuroborosEngine::interview_structured`.
async fn interview_structured(
    &self,
    user_input: &str,
) -> Result<Option<Vec<InterviewQuestionOutput>>> {
    // Path syntax names the inherent method explicitly, so this does not
    // call back into this trait method.
    let questions = OuroborosEngine::interview_structured(self, user_input).await?;
    Ok(questions)
}
/// Interview phase: classifies `user_input` as task or conversation and,
/// for tasks, records clarifying questions and ambiguity scores.
///
/// If the model's answer cannot be parsed, the result of
/// `crate::degraded::degraded_interview` is used with fixed low clarity
/// scores. Non-task messages return early with a chat response and
/// `ready_for_seed = false`. When the model omits scores, 0.5 is used for
/// every dimension.
///
/// # Errors
/// Propagates errors from the underlying completion call.
async fn interview(&self, user_input: &str) -> Result<InterviewResult> {
    self.set_phase(Phase::Interview);

    let user_message = format!(
        "The user said:\n\"{user_input}\"\n\n\
        LANGUAGE: Write ALL text output (questions, chat_response) in the SAME language as the user's message above.\n\n\
        Analyze this message and produce a JSON object with:\n\
        - \"is_task\": true if the message requests a concrete action (create, read, write, run, find, fix, analyze, deploy, etc.) or describes something to build/execute. false for greetings, small talk, questions, gratitude, opinions, or conversational messages.\n\
        - \"chat_response\": (only when is_task=false) A natural, friendly response in the user's language. Be warm, concise, and helpful. Skip this field when is_task=true.\n\
        - \"complexity\": (only when is_task=true) \"simple\" for clear single-action requests that need no clarification (check weather, set alarm, search, calculate, simple file read/write, echo). \"complex\" for ambiguous or multi-step tasks (modify code, write blog post, deploy, analyze). Default to \"complex\" when unsure.\n\
        - \"questions\": (only when is_task=true) Up to 3 Socratic clarifying questions in the user's language. Empty array when is_task=false.\n\
        - \"scores\": (only when is_task=true) {{ \"goal_clarity\": 0.0-1.0, \"constraint_clarity\": 0.0-1.0, \"success_criteria\": 0.0-1.0 }}. Skip this field when is_task=false.\n\n\
        IMPORTANT SCORING (when is_task=true):\n\
        - Score GOAL_CLARITY 0.9+ ONLY if the request is immediately executable with no ambiguity\n\
        - Score CONSTRAINT_CLARITY 0.8+ ONLY if specific filenames, paths, or content are provided\n\
        - Score SUCCESS_CRITERIA 0.7+ ONLY if 'done' is clearly defined\n\
        - Be HONEST with clarity scores. When in doubt, score LOWER."
    );
    let raw = self
        .llm_complete(INTERVIEW_SYSTEM_PROMPT, &user_message)
        .await?;

    let parsed = match Self::parse_json::<InterviewResponse>(&raw) {
        Ok(response) => response,
        Err(e) => {
            tracing::warn!(error = %e, "Failed to parse interview LLM response, using degraded fallback");
            let degraded = crate::degraded::degraded_interview(user_input);
            let questions: Vec<String> = if degraded.questions.is_empty() {
                vec!["Could you describe the goal in more detail?".into()]
            } else {
                degraded.questions
            };
            InterviewResponse {
                is_task: degraded.is_task,
                chat_response: degraded.chat_response,
                questions,
                structured_questions: None,
                scores: Some(AmbiguityScores {
                    goal_clarity: 0.4,
                    constraint_clarity: 0.3,
                    success_criteria: 0.2,
                }),
                complexity: default_complexity(),
            }
        }
    };

    // Conversational message: answer directly, no seed will be generated.
    if !parsed.is_task {
        let mut chat = InterviewResult::new();
        chat.original_message = user_input.to_string();
        chat.is_task = false;
        chat.chat_response = if parsed.chat_response.is_empty() {
            "Hello! How can I help you today?".to_string()
        } else {
            parsed.chat_response
        };
        chat.ready_for_seed = false;
        chat.complexity = "n/a".to_string();
        tracing::info!(is_task = false, "Interview phase complete (chat)");
        return Ok(chat);
    }

    let AmbiguityScores {
        goal_clarity,
        constraint_clarity,
        success_criteria,
    } = parsed.scores.unwrap_or(AmbiguityScores {
        goal_clarity: 0.5,
        constraint_clarity: 0.5,
        success_criteria: 0.5,
    });
    let ambiguity = AmbiguityScore::new(goal_clarity, constraint_clarity, success_criteria);
    let ambiguity_value = ambiguity.ambiguity();

    let mut outcome = InterviewResult::new();
    outcome.original_message = user_input.to_string();
    outcome.complexity = parsed.complexity.clone();
    // Questions are stored with empty answers at this point.
    for question in &parsed.questions {
        outcome.add_exchange(question, "");
    }
    outcome.update_ambiguity(ambiguity);

    tracing::info!(
        ambiguity = ambiguity_value,
        ready = outcome.ready_for_seed,
        complexity = %parsed.complexity,
        questions = parsed.questions.len(),
        "Interview phase complete (task)"
    );
    Ok(outcome)
}
/// Seed phase: turns the interview result into a generation-0 `Seed`.
///
/// The prompt contains the original request (or the first interview
/// question when the request is empty) and, if any answer is non-empty, the
/// full question/answer list. If the model's answer cannot be parsed,
/// `crate::degraded::degraded_seed` supplies the seed content.
///
/// # Errors
/// Propagates errors from the underlying completion call.
async fn generate_seed(&self, interview: &InterviewResult) -> Result<Seed> {
    self.set_phase(Phase::Seed);

    let original_message = if !interview.original_message.is_empty() {
        interview.original_message.clone()
    } else {
        interview.questions.first().cloned().unwrap_or_default()
    };

    let mut context_block = format!("## Original Request\n{original_message}");
    let has_answers = interview.answers.iter().any(|a| !a.is_empty());
    if has_answers {
        let mut pairs = Vec::new();
        for (q, a) in interview.questions.iter().zip(interview.answers.iter()) {
            pairs.push(format!("Q: {q}\nA: {a}"));
        }
        context_block.push_str("\n\n## Clarification Q&A\n");
        context_block.push_str(&pairs.join("\n\n"));
    }

    let user_message = format!(
        "{context_block}\n\n\
        LANGUAGE: Write the goal and all text fields in the SAME language as the user's original request above.\n\n\
        Generate a Seed specification that faithfully captures the user's ORIGINAL request.\n\
        The goal MUST preserve exact details (filenames, content, paths, languages) from the request.\n\
        Do NOT generalize or abstract — keep the specific details.\n\n\
        Produce a JSON object with:\n\
        - \"goal\": a single clear goal in the user's language that preserves ALL specifics from the original request\n\
        - \"constraints\": list of constraints\n\
        - \"acceptance_criteria\": list of measurable acceptance criteria that verify the specific details\n\
        - \"ontology\": list of {{ \"name\": \"\", \"entity_type\": \"\", \"description\": \"\" }} domain entities"
    );
    let raw = self.llm_complete(SEED_SYSTEM_PROMPT, &user_message).await?;

    let spec = match Self::parse_json::<SeedResponse>(&raw) {
        Ok(response) => response,
        Err(e) => {
            tracing::warn!(error = %e, "Failed to parse seed LLM response, using degraded fallback");
            let degraded = crate::degraded::degraded_seed(interview);
            SeedResponse {
                goal: degraded.goal,
                constraints: degraded.constraints,
                acceptance_criteria: degraded.acceptance_criteria,
                ontology: degraded.ontology,
            }
        }
    };

    let seed = Seed {
        id: uuid::Uuid::new_v4(),
        goal: spec.goal,
        constraints: spec.constraints,
        acceptance_criteria: spec.acceptance_criteria,
        ontology: spec.ontology,
        created_at: Utc::now(),
        generation: 0,
        parent_seed_id: None,
        cspace_hint: None,
        original_request: interview.original_message.clone(),
        output_schema: None,
        project_id: None,
        workspace_context: None,
        mount_paths: Vec::new(),
    };
    tracing::info!(seed_id = %seed.id, goal = %seed.goal, "Seed generated");
    Ok(seed)
}
/// Execute phase placeholder: sets the phase, logs the delegation and
/// returns an `ExecutionResult` with `success = false`, zero steps, no tool
/// calls and zero token counts. No work is performed here.
async fn execute(&self, seed: &Seed) -> Result<ExecutionResult> {
    self.set_phase(Phase::Execute);
    tracing::info!(seed_id = %seed.id, "Execute phase (delegated to AgentRuntime via Supervisor)");

    let output = format!("Execution of seed {} delegated to agent runtime", seed.id);
    let placeholder = ExecutionResult {
        output,
        steps_completed: 0,
        success: false,
        tool_calls: Vec::new(),
        tokens_input: 0,
        tokens_output: 0,
        model_id: String::new(),
    };
    Ok(placeholder)
}
/// Evaluate phase: scores `execution` against the seed's acceptance criteria.
///
/// Order of checks:
/// 1. A cached result for (seed, execution) is returned as-is.
/// 2. If the mechanical evaluation passes every criterion, a score of 1.0 is
///    returned (and cached) without calling the model.
/// 3. Otherwise the model is asked for a verdict, given the mechanical
///    results and the first 3000 characters of the output. A parsed verdict
///    is cached; if parsing fails even after the retry, the uncached result
///    of `crate::degraded::degraded_evaluation` is returned.
async fn evaluate(&self, seed: &Seed, execution: &ExecutionResult) -> Result<EvaluationResult> {
    self.set_phase(Phase::Evaluate);
    if let Some(cached) = self.eval_cache.get(seed, execution) {
        tracing::info!(seed_id = %seed.id, "Evaluation cache hit");
        return Ok(cached);
    }
    let mechanical = crate::evaluation::MechanicalEvalResult::evaluate(
        &seed.acceptance_criteria,
        &execution.output,
    );
    if mechanical.all_passed {
        let result = EvaluationResult {
            mechanical_pass: true,
            semantic_pass: None,
            consensus_pass: None,
            score: 1.0,
            notes: mechanical
                .criterion_results
                .iter()
                .map(|r| format!("✓ {}", r.criterion))
                .collect(),
        };
        self.eval_cache.put(seed, execution, result.clone());
        tracing::info!(seed_id = %seed.id, score = 1.0, "Mechanical evaluation passed, skipping LLM");
        return Ok(result);
    }
    let mechanical_notes: String = mechanical
        .criterion_results
        .iter()
        .map(|r| format!("- {}: {} ({})", r.criterion, r.passed, r.reason))
        .collect::<Vec<_>>()
        .join("\n");
    let numbered_criteria = seed
        .acceptance_criteria
        .iter()
        .enumerate()
        .map(|(i, c)| format!("{}. {}", i + 1, c))
        .collect::<Vec<_>>()
        .join("\n");
    // Truncate by characters, not bytes: slicing the output at a fixed byte
    // offset panics when the offset lands inside a multi-byte UTF-8
    // character (any non-ASCII output longer than the limit can hit this).
    let output_excerpt: String = execution.output.chars().take(3000).collect();
    let system_prompt = EVALUATE_SYSTEM_PROMPT;
    let user_message = format!(
        "## Goal\n{}\n\n## Acceptance Criteria\n{}\n\n\
        ## Mechanical Check Results\n{}\n\n\
        ## Execution Output (first 3000 chars)\n{}\n\n\
        Evaluate whether the execution output satisfies the goal and acceptance criteria.\n\
        Produce a JSON object:\n\
        - \"mechanical_pass\": {}\n\
        - \"semantic_pass\": true/false\n\
        - \"score\": 0.0 to 1.0\n\
        - \"notes\": list of evaluation notes",
        seed.goal,
        numbered_criteria,
        mechanical_notes,
        output_excerpt,
        mechanical.all_passed,
    );
    let result = match self
        .llm_json::<EvaluationResponse>(system_prompt, &user_message)
        .await
    {
        Ok(parsed) => {
            let r = EvaluationResult {
                mechanical_pass: parsed.mechanical_pass,
                semantic_pass: Some(parsed.semantic_pass),
                consensus_pass: None,
                score: parsed.score,
                notes: parsed.notes,
            };
            self.eval_cache.put(seed, execution, r.clone());
            r
        }
        Err(e) => {
            tracing::warn!(error = %e, "Evaluation JSON parse failed after retry, using degraded fallback");
            crate::degraded::degraded_evaluation(seed, &execution.output, mechanical.all_passed)
        }
    };
    tracing::info!(
        seed_id = %seed.id,
        mechanical = result.mechanical_pass,
        semantic = ?result.semantic_pass,
        score = result.score,
        "Evaluation complete"
    );
    Ok(result)
}
/// Evolve phase: produces a successor seed from a failed evaluation.
///
/// Returns `Ok(None)` when the evaluation passed with a score of at least
/// 0.8. Otherwise the evaluation is recorded in the generation history and
/// the model is asked for a revised seed. From generation 2 onwards the
/// prompt may also contain a lateral-thinking block (when stagnation is
/// detected and a persona is available) and a regression block (when
/// regressions are detected). If the model's answer cannot be parsed, the
/// successor reuses the current seed's goal, constraints, acceptance
/// criteria and ontology.
///
/// # Errors
/// Propagates errors from the underlying completion call.
async fn evolve(&self, seed: &Seed, evaluation: &EvaluationResult) -> Result<Option<Seed>> {
    self.set_phase(Phase::Evolve);
    if evaluation.all_passed() && evaluation.score >= 0.8 {
        tracing::info!(seed_id = %seed.id, "Evaluation passed, no evolution needed");
        return Ok(None);
    }
    self.record_evaluation(seed.clone(), evaluation);
    let base_context = format!(
        "## Original Seed\n\
        Goal: {}\n\
        Constraints: {}\n\
        Acceptance Criteria: {}\n\n\
        ## Evaluation Result\n\
        Mechanical pass: {}\n\
        Semantic pass: {}\n\
        Score: {}\n\
        Notes: {}",
        seed.goal,
        seed.constraints.join(", "),
        seed.acceptance_criteria
            .iter()
            .enumerate()
            .map(|(i, c)| format!("{}. {}", i + 1, c))
            .collect::<Vec<_>>()
            .join("\n"),
        evaluation.mechanical_pass,
        evaluation
            .semantic_pass
            .map(|p| p.to_string())
            .unwrap_or_else(|| "not evaluated".into()),
        evaluation.score,
        evaluation.notes.join("; "),
    );
    let mut context_blocks = vec![base_context];
    if seed.generation >= 2 {
        if let Some(pattern) = self.detect_stagnation() {
            tracing::info!(seed_id = %seed.id, pattern = ?pattern, "Stagnation detected, applying lateral thinking");
            // Persona names already recorded in the history as "lateral:<name>"
            // hints. The read guard is dropped at the end of this block,
            // before the write lock below is taken.
            let tried: Vec<String> = {
                let history = self.generation_history.read();
                history
                    .iter()
                    .filter_map(|r| r.seed.cspace_hint.as_deref())
                    .filter_map(|h| h.strip_prefix("lateral:"))
                    .map(str::to_string)
                    .collect()
            };
            let tried_refs: Vec<&str> = tried.iter().map(|s| s.as_str()).collect();
            if let Some(persona) = crate::lateral::select_persona(pattern, &tried_refs) {
                let lateral = crate::lateral::build_lateral_prompt(
                    persona,
                    &seed.goal,
                    &format!(
                        "Score={:.2}, passed={}",
                        evaluation.score,
                        evaluation.all_passed()
                    ),
                    &evaluation.notes,
                );
                context_blocks.push(lateral);
                // Tag the newest history record so this persona counts as
                // "tried" on later generations.
                let mut guard = self.generation_history.write();
                if let Some(last) = guard.last_mut() {
                    last.seed.cspace_hint = Some(format!("lateral:{}", persona.name));
                }
            }
        }
        let regressions = self.detect_regressions();
        if !regressions.is_empty() {
            let reg_text =
                crate::regression::RegressionDetector::format_for_prompt(&regressions);
            context_blocks.push(reg_text);
            tracing::info!(
                seed_id = %seed.id,
                count = regressions.len(),
                "Injecting regression context"
            );
        }
    }
    let user_message = context_blocks.join("\n\n---\n\n");
    let system_prompt = EVOLVE_SYSTEM_PROMPT;
    let raw = self.llm_complete(system_prompt, &user_message).await?;
    let parsed: SeedResponse = Self::parse_json(&raw).unwrap_or_else(|e| {
        tracing::warn!(error = %e, "Failed to parse evolve LLM response");
        SeedResponse {
            goal: seed.goal.clone(),
            constraints: seed.constraints.clone(),
            acceptance_criteria: seed.acceptance_criteria.clone(),
            ontology: seed.ontology.clone(),
        }
    });
    // `evolved_from` supplies the new id, generation, parent link and hint;
    // the content fields come from the model's answer.
    let evolved = Seed::evolved_from(seed);
    let new_seed = Seed {
        id: evolved.id,
        goal: parsed.goal,
        constraints: parsed.constraints,
        acceptance_criteria: parsed.acceptance_criteria,
        ontology: parsed.ontology,
        created_at: Utc::now(),
        generation: evolved.generation,
        parent_seed_id: evolved.parent_seed_id,
        cspace_hint: evolved.cspace_hint,
        original_request: seed.original_request.clone(),
        output_schema: None,
        project_id: seed.project_id,
        workspace_context: seed.workspace_context.clone(),
        mount_paths: seed.mount_paths.clone(),
    };
    tracing::info!(
        original_seed = %seed.id,
        evolved_seed = %new_seed.id,
        generation = new_seed.generation,
        "Seed evolved"
    );
    Ok(Some(new_seed))
}
}
// System prompt for the interview phase, used by both `interview` and
// `interview_structured`. The text is sent to the model verbatim, so its
// line breaks and wording are part of runtime behavior.
const INTERVIEW_SYSTEM_PROMPT: &str = "\
You are the Interview phase of the Ouroboros protocol. \
Your job: determine whether the user's message is a task or conversation, \
and if it's a task, assess ambiguity along three dimensions.
## Language Fidelity (CRITICAL)
You MUST match the language of the user's message in ALL output text.
- Whatever language the user uses, you use that SAME language. No exceptions.
- This applies to: questions, chat_response, structured_questions labels/descriptions.
- Never translate, paraphrase, or switch to a different language regardless of context length or turn number.
## Critical Boundaries
- NEVER propose solutions. You ask, you do not implement.
- NEVER say \"I will...\" or \"Let me...\" — you are an interviewer, not an executor.
- NEVER skip scoring. Every task gets ambiguity scores.
## Scoring Policy
Be HONEST, not generous:
- Score GOAL_CLARITY below 0.5 if the user's intent is genuinely ambiguous
- Score CONSTRAINT_CLARITY below 0.5 if no specifics are mentioned
- Score SUCCESS_CRITERIA below 0.5 if \"done\" is undefined
- Reserve 0.9+ for requests that are immediately executable as-is
- When in doubt, score LOWER — it is cheaper to ask than to guess wrong
## Conversation Detection
- Greetings, thanks, opinions, questions about capabilities → is_task: false
- Any verb implying action (create, fix, find, deploy, analyze, review) → is_task: true
- When uncertain, default to is_task: true
## Question Quality
Bad: \"Could you tell me more about your requirements?\"
Good: \"You said 'optimize the API' — optimize for latency, throughput, or cost?\"
Questions must target a SPECIFIC ambiguity, not invite a general brain dump.
Maximum 3 questions. Each must be answerable in one sentence.
Always respond with valid JSON in the exact format requested. \
Do not include any text outside the JSON object.";
// System prompt for the seed phase (`generate_seed`). The text is sent to
// the model verbatim, so its line breaks and wording are part of runtime
// behavior.
const SEED_SYSTEM_PROMPT: &str = "\
You are the Seed Architect of the Ouroboros protocol. \
Your job: crystallize interview results into an immutable specification.
## Core Principle
A Seed is a CONTRACT — it will be executed by an autonomous agent without \
further human input. If the Seed is ambiguous, the execution WILL go wrong.
## Mandatory Properties
- COMPLETE: Contains EVERYTHING the agent needs. No assumed context.
- SPECIFIC: Exact filenames, paths, languages, frameworks — never \"a file\" \
or \"the system\".
- MEASURABLE: Each acceptance criterion can be verified by running a command \
or checking file content. No subjective criteria like \"clean code\".
## Scope Guard
Do NOT expand beyond the user's request:
- If they asked for a single function, do not spec a module
- If they specified a language, do not suggest alternatives
- If they named a file, use THAT filename, not a \"better\" one
If the interview was insufficient to produce a complete Seed, include the \
constraint: \"Requires human clarification: [what's missing]\"
Always respond with valid JSON in the exact format requested. \
Do not include any text outside the JSON object.";
// System prompt for the evaluate phase (`evaluate`). The text is sent to the
// model verbatim, so its line breaks and wording are part of runtime
// behavior.
const EVALUATE_SYSTEM_PROMPT: &str = "\
You are the Evaluator of the Ouroboros protocol. \
Your job: determine whether execution output satisfies the Seed specification.
## Two-Stage Evaluation
Stage 1 — Mechanical: Does the output explicitly address each acceptance criterion?
- If the agent claims to have created a file, look for the file content or path
- If the agent claims to have run a command, look for command output
- Absence of evidence = evidence of absence
Stage 2 — Semantic: Does the output actually solve the user's intent?
- The agent may check every box but still miss the point
- Look for the gap between \"technically correct\" and \"genuinely useful\"
## Scoring Policy
- 0.9–1.0: All criteria met, output is complete and correct
- 0.7–0.8: Core goal achieved, minor issues or missing optional elements
- 0.5–0.6: Partially done, significant gaps
- Below 0.5: Fundamentally failed or produced nothing useful
## Anti-Patterns (score penalty)
- Agent claims completion without showing evidence → cap at 0.5
- Agent solved a different problem than specified → cap at 0.4
- Agent made changes not in the Seed scope → flag as scope violation
- Agent output is generic/boilerplate that could apply to anything → cap at 0.3
## Evidence Requirement
Do NOT give credit for claims. Give credit for DEMONSTRATED results:
- \"I created the file\" → Show me the file content
- \"Tests pass\" → Show me the test output
- \"The bug is fixed\" → Show me before/after behavior
Always respond with valid JSON in the exact format requested. \
Do not include any text outside the JSON object.";
// System prompt for the evolve phase (`evolve`). The text is sent to the
// model verbatim, so its line breaks and wording are part of runtime
// behavior.
const EVOLVE_SYSTEM_PROMPT: &str = "\
You are the Evolve phase of the Ouroboros protocol. \
Your job: improve a Seed based on evaluation failure analysis.
## Before You Evolve
1. Read the evaluation notes carefully — WHAT failed and WHY
2. Distinguish between:
- SPEC issues (Seed was ambiguous or incomplete) → Fix the Seed
- EXECUTION issues (Agent misunderstood or went off-track) → Add constraints/guards
- IMPOSSIBLE issues (Goal is infeasible as stated) → Flag for human review
## Evolution Rules
- Preserve what WORKED — do not change passing acceptance criteria
- Add constraints that prevent known failure modes
- Tighten criteria that were too vague
- If the goal itself was wrong, flag it rather than silently changing it
## Scope Guard
Evolution narrows scope, never expands it:
- Do NOT add new features the user didn't request
- Do NOT change the goal to something \"better\"
- Do NOT add acceptance criteria for problems the user didn't mention
## Stagnation Detection
If this is generation 3+ and the same issues persist:
- The Seed may be fundamentally flawed — suggest restarting the interview
- Consider whether the task needs to be decomposed into smaller Seeds
Always respond with valid JSON in the exact format requested. \
Do not include any text outside the JSON object.";
/// Debug output shows only the current phase and the model id; the provider,
/// persona prompt, cache and history are omitted.
impl std::fmt::Debug for OuroborosEngine {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let current_phase = self.phase();
        let mut builder = f.debug_struct("OuroborosEngine");
        builder.field("phase", &current_phase);
        builder.field("model", &self.model.id);
        builder.finish()
    }
}