oxios-ouroboros 1.5.1

Ouroboros spec-first protocol for Oxios
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
//! Ouroboros engine: LLM-backed implementation of the Ouroboros protocol.
//!
//! Uses an `oxi_sdk::Provider` to drive the five-phase lifecycle:
//! interview → seed → execute → evaluate → evolve.
//!
//! The interview and generate_seed phases use LLM calls to produce
//! Socratic questions and crystallize answers into structured Seeds.
//! The execute phase is delegated to an external executor (AgentRuntime).
//! The evaluate phase uses three-stage verification.

use anyhow::Result;
use async_trait::async_trait;
use chrono::Utc;
use futures::StreamExt;
use oxi_sdk::{Context, Message, Model, Provider, UserMessage};
use serde::{Deserialize, Serialize};
use std::sync::Arc;

use crate::evaluation::EvaluationResult;
use crate::interview::InterviewResult;
use crate::protocol::{ExecutionResult, OuroborosProtocol, Phase};
use crate::seed::{AmbiguityScore, Entity, Seed};

// ---------------------------------------------------------------------------
// JSON shapes we parse from LLM responses
// ---------------------------------------------------------------------------

/// Expected LLM response shape for the interview phase.
///
/// Deserialized from the model's JSON reply. Every field tolerates being
/// absent: most carry an explicit serde default, and `scores` is an
/// `Option`, which serde's derive fills with `None` when the key is missing.
#[derive(Debug, Deserialize)]
struct InterviewResponse {
    /// Whether this message requires task execution (tools, files, etc.)
    /// or is just conversational (greetings, questions, small talk).
    /// Defaults to `true` when the LLM omits the field.
    #[serde(default = "default_true")]
    is_task: bool,
    /// Direct conversational response (only used when is_task = false).
    /// Defaults to the empty string.
    #[serde(default)]
    chat_response: String,
    /// Socratic questions to ask the user (only used when is_task = true).
    /// Defaults to an empty list.
    #[serde(default)]
    questions: Vec<String>,
    /// Structured form of the same questions with options/kind for the
    /// interactive Web UI. Optional — when the LLM omits this field or
    /// the JSON is malformed, the frontend falls back to plain markdown
    /// rendering. The plain `questions` array is always present and
    /// remains the source of truth for the Orchestrator.
    #[serde(default)]
    structured_questions: Option<Vec<InterviewQuestionOutput>>,
    /// Ambiguity scores along each dimension (0.0–1.0 clarity).
    /// `None` when the LLM does not provide them.
    scores: Option<AmbiguityScores>,
    /// Task complexity: "simple" for clear single-action requests,
    /// "complex" for ambiguous or multi-step tasks.
    /// Defaults to "complex" (safe default — extra LLM call is cheaper
    /// than misrouting an ambiguous request).
    #[serde(default = "default_complexity")]
    complexity: String,
}

/// Serde fallback for a missing `complexity` field: treat the request as
/// `"complex"`.
fn default_complexity() -> String {
    String::from("complex")
}

/// Serde fallback for boolean fields that should be `true` when absent.
fn default_true() -> bool {
    const DEFAULT: bool = true;
    DEFAULT
}

/// Ambiguity sub-scores from the LLM.
///
/// The interview prompt asks for each value in the range 0.0–1.0. All three
/// keys are required: a `scores` object missing any of them fails to
/// deserialize.
#[derive(Debug, Deserialize)]
struct AmbiguityScores {
    /// Clarity of the goal (prompted as `goal_clarity`).
    goal_clarity: f64,
    /// Clarity of the constraints (prompted as `constraint_clarity`).
    constraint_clarity: f64,
    /// Clarity of the success criteria (prompted as `success_criteria`).
    success_criteria: f64,
}

/// Expected LLM response shape for the seed generation phase.
///
/// `goal`, `constraints` and `acceptance_criteria` are required keys;
/// `ontology` may be omitted.
#[derive(Debug, Deserialize)]
struct SeedResponse {
    /// Goal statement for the seed.
    goal: String,
    /// Constraints listed by the LLM.
    constraints: Vec<String>,
    /// Acceptance criteria listed by the LLM.
    acceptance_criteria: Vec<String>,
    /// Ontology entities; defaults to an empty list when the key is absent.
    #[serde(default)]
    ontology: Vec<Entity>,
}

// ---------------------------------------------------------------------------
// Structured interview questions (chat UI redesign — interactive interview)
//
// The interview LLM is asked to produce a parallel `structured_questions`
// array alongside the plain `questions` strings. The frontend renders
// structured questions as interactive UI (chips, yes/no buttons, etc.).
// When the LLM omits `structured_questions` or returns malformed JSON,
// the frontend falls back to the plain markdown response — graceful
// degradation. The plain `questions: Vec<String>` field is kept
// untouched so the Orchestrator's existing logic continues to work.
// ---------------------------------------------------------------------------

/// Single option for a structured interview question.
///
/// `value` and `label` are required keys when deserializing; `description`
/// is optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterviewOptionOutput {
    /// Stable identifier for the answer payload (e.g. "points", "ko").
    pub value: String,
    /// Human-readable label rendered as a chip/button.
    pub label: String,
    /// Optional longer description shown as a tooltip.
    /// Defaults to the empty string when the key is absent.
    #[serde(default)]
    pub description: String,
}

/// One structured question produced by the LLM.
///
/// `id` and `text` are required keys when deserializing; `kind` and
/// `options` fall back to defaults.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterviewQuestionOutput {
    /// Short identifier used as the answer key (e.g. "q1", "q2").
    pub id: String,
    /// The question text (also present in the parallel `questions` array).
    pub text: String,
    /// Question kind — drives the frontend widget selection.
    /// Defaults to "free_text" when the key is absent.
    #[serde(default = "default_question_kind")]
    pub kind: String,
    /// Choice options (empty for free_text / yes_no).
    /// Defaults to an empty list when the key is absent.
    #[serde(default)]
    pub options: Vec<InterviewOptionOutput>,
}

/// Serde fallback for a missing question `kind`: a plain free-text question.
fn default_question_kind() -> String {
    String::from("free_text")
}

/// Expected LLM response shape for the evaluation phase.
///
/// All four keys are required; a reply missing any of them fails to
/// deserialize.
#[derive(Debug, Deserialize)]
struct EvaluationResponse {
    /// Verdict of the mechanical check.
    /// NOTE(review): the exact meaning of "mechanical" is defined by the
    /// evaluation prompt, which is not visible here — confirm.
    mechanical_pass: bool,
    /// Verdict of the semantic check (same caveat as above).
    semantic_pass: bool,
    /// Overall score reported by the LLM.
    /// NOTE(review): presumably 0.0–1.0 — confirm against the prompt.
    score: f64,
    /// Free-form notes accompanying the verdict.
    notes: Vec<String>,
}

// ---------------------------------------------------------------------------
// Engine
// ---------------------------------------------------------------------------

/// LLM-powered implementation of the Ouroboros protocol.
///
/// The engine uses the injected `oxi_sdk::Provider` to make LLM calls
/// for interview, seed generation, evaluation, and evolution.
///
/// All mutable state sits behind `parking_lot` locks, so the engine's
/// methods take `&self`.
pub struct OuroborosEngine {
    /// Provider whose `stream` method is used for every LLM call.
    provider: Arc<dyn Provider>,
    /// Model passed to the provider on every call.
    model: Model,
    /// Current lifecycle phase; starts at `Phase::Interview`.
    phase: parking_lot::Mutex<Phase>,
    /// Optional persona system prompt, prepended to every LLM call.
    persona_prompt: parking_lot::Mutex<Option<String>>,
    /// Evaluation cache for avoiding redundant LLM calls.
    eval_cache: crate::eval_cache::EvalCache,
    /// Generation history for stagnation + regression detection.
    /// Kept across evolve() calls within one Orchestrator session.
    generation_history: parking_lot::RwLock<Vec<crate::regression::GenerationRecord>>,
}

impl OuroborosEngine {
    /// Build an engine around `provider` and `model`.
    ///
    /// The engine starts in `Phase::Interview`, with no persona prompt, an
    /// empty generation history, and an evaluation cache constructed with
    /// `256` (presumably its capacity — confirm against `EvalCache::new`).
    pub fn new(provider: Arc<dyn Provider>, model: Model) -> Self {
        let phase = parking_lot::Mutex::new(Phase::Interview);
        let persona_prompt = parking_lot::Mutex::new(None);
        let eval_cache = crate::eval_cache::EvalCache::new(256);
        let generation_history = parking_lot::RwLock::new(Vec::new());

        Self {
            provider,
            model,
            phase,
            persona_prompt,
            eval_cache,
            generation_history,
        }
    }

    /// Snapshot of the lifecycle phase the engine is currently in.
    pub fn phase(&self) -> Phase {
        let current = self.phase.lock();
        *current
    }

    /// Overwrite the engine's lifecycle phase with `phase`.
    fn set_phase(&self, phase: Phase) {
        let mut current = self.phase.lock();
        *current = phase;
    }

    /// Record an evaluation result into generation history for
    /// stagnation and regression detection.
    ///
    /// Each evaluation note that starts with a pass or fail marker
    /// contributes one entry to the record's `ac_results` (`true` for pass,
    /// `false` for fail); notes without a marker are ignored. Only the ten
    /// most recent records are kept — the oldest is evicted first.
    pub fn record_evaluation(&self, seed: Seed, evaluation: &EvaluationResult) {
        /// Maximum number of generation records retained.
        const MAX_GENERATIONS: usize = 10;

        /// Classify one evaluation note by its leading marker.
        ///
        /// The markers must be non-empty: `str::starts_with("")` is true for
        /// every string, which would count every note as a passing criterion
        /// and make the failure branch unreachable.
        ///
        /// NOTE(review): the exact glyphs the evaluator emits are not visible
        /// from this file; the common check/cross marks are accepted here —
        /// confirm against the evaluation prompt.
        fn criterion_outcome(note: &str) -> Option<bool> {
            const PASS_MARKERS: [&str; 3] = ["✅", "✓", "✔"];
            const FAIL_MARKERS: [&str; 4] = ["❌", "✗", "✘", "x "];

            if PASS_MARKERS.iter().any(|marker| note.starts_with(*marker)) {
                Some(true)
            } else if FAIL_MARKERS.iter().any(|marker| note.starts_with(*marker)) {
                Some(false)
            } else {
                None
            }
        }

        let ac_results: Vec<bool> = evaluation
            .notes
            .iter()
            .filter_map(|note| criterion_outcome(note))
            .collect();

        let record = crate::regression::GenerationRecord {
            seed,
            ac_results,
            score: evaluation.score,
        };

        let mut history = self.generation_history.write();
        history.push(record);
        // Evict from the front so the history stays a sliding window of the
        // latest generations.
        if history.len() > MAX_GENERATIONS {
            let excess = history.len() - MAX_GENERATIONS;
            history.drain(..excess);
        }
    }

    /// Detect stagnation across the current generation history.
    fn detect_stagnation(&self) -> Option<crate::lateral::StagnationPattern> {
        let history = self.generation_history.read();
        if history.len() < 2 {
            return None;
        }

        let scores: Vec<f64> = history.iter().map(|r| r.score).collect();
        let latest = *scores.last()?;
        let prev = scores[scores.len() - 2];

        let drift = (latest - prev).abs();
        let improvement = latest - prev;

        // No drift at all.
        if drift < 0.01 {
            return Some(crate::lateral::StagnationPattern::NoDrift);
        }

        // Oscillation: last two iterations went opposite directions.
        if scores.len() >= 3 {
            let prev2 = scores[scores.len() - 3];
            if (latest > prev && prev < prev2) || (latest < prev && prev > prev2) {
                return Some(crate::lateral::StagnationPattern::Oscillation);
            }
        }

        // Diminishing returns: each improvement is smaller than the last.
        if history.len() >= 3 && improvement > 0.0 {
            let improvements: Vec<f64> = scores.windows(2).map(|w| w[1] - w[0]).collect();
            if improvements.len() >= 2 {
                let last = improvements[improvements.len() - 1];
                let prev_imp = improvements[improvements.len() - 2];
                if 0.0 < last && last < prev_imp * 0.5 {
                    return Some(crate::lateral::StagnationPattern::DiminishingReturns);
                }
            }
        }

        None
    }

    /// Feed a copy of every recorded generation into a fresh
    /// `RegressionDetector` and return whatever regressions it reports.
    fn detect_regressions(&self) -> Vec<crate::regression::Regression> {
        let history = self.generation_history.read();
        let records = history.iter().cloned();
        crate::regression::RegressionDetector::new()
            .record_all(records)
            .detect()
    }

    /// Install `prompt` as the persona system prompt; `None` clears it.
    #[allow(dead_code)]
    fn set_persona_prompt(&self, prompt: Option<String>) {
        let mut persona = self.persona_prompt.lock();
        *persona = prompt;
    }

    /// Run a non-tool LLM completion and return the text content.
    async fn llm_complete(&self, system_prompt: &str, user_message: &str) -> Result<String> {
        // Prepend persona prompt if set.
        let effective_system = if let Some(ref persona) = *self.persona_prompt.lock() {
            format!("{persona}\n\n{system_prompt}")
        } else {
            system_prompt.to_string()
        };

        let mut ctx = Context::new();
        ctx.set_system_prompt(effective_system);
        ctx.add_message(Message::User(UserMessage::new(user_message)));

        let stream = self.provider.stream(&self.model, &ctx, None).await?;

        // Collect the stream into a single text string.
        let mut text = String::new();
        tokio::pin!(stream);
        while let Some(event) = stream.next().await {
            match event {
                oxi_sdk::ProviderEvent::TextDelta { delta, .. } => {
                    text.push_str(&delta);
                }
                oxi_sdk::ProviderEvent::Done { .. } => break,
                oxi_sdk::ProviderEvent::Error { error, .. } => {
                    // Try to extract text from the error message.
                    let msg_text = error.text_content();
                    if !msg_text.is_empty() {
                        text = msg_text;
                    } else {
                        anyhow::bail!("LLM stream error");
                    }
                    break;
                }
                _ => {}
            }
        }

        Ok(text)
    }

    /// Parse JSON from LLM output, handling markdown fences and prose wrapping.
    ///
    /// Extraction strategy, applied to the trimmed input:
    /// 1. If it starts with a ``` fence, parse the text between the opening
    ///    fence line and the last closing fence. A missing closing fence
    ///    (truncated output) is tolerated: the rest of the text is parsed.
    /// 2. Otherwise try the span from the first `{` to the last `}`; if that
    ///    does not deserialize into `T`, try the span from the first `[` to
    ///    the last `]` (covers a top-level array of objects).
    /// 3. With no usable span, parse the trimmed text as-is.
    ///
    /// # Errors
    /// Returns the `serde_json` error of the first attempted candidate. This
    /// function never panics on malformed input.
    fn parse_json<T: serde::de::DeserializeOwned>(raw: &str) -> Result<T> {
        /// Span from the first `open` to the last `close`, or `None` unless
        /// the closer really comes after the opener.
        fn delimited(text: &str, open: char, close: char) -> Option<&str> {
            let start = text.find(open)?;
            let end = text.rfind(close)?;
            // Callers pass single-byte ASCII delimiters, so `..=end` always
            // lands on a char boundary.
            if end > start {
                Some(&text[start..=end])
            } else {
                None
            }
        }

        let trimmed = raw.trim();

        if let Some(rest) = trimmed.strip_prefix("```") {
            // Skip the remainder of the opening fence line (e.g. a `json` tag).
            let body = rest.find('\n').map_or(rest, |newline| &rest[newline + 1..]);
            // Only look for the closing fence *after* the opener; searching
            // the whole text would find the opener itself when the closer is
            // missing and produce an inverted slice.
            let body = body.rfind("```").map_or(body, |close| &body[..close]);
            return Ok(serde_json::from_str(body.trim())?);
        }

        // Object first (every response schema in this module is an object),
        // array as a fallback. Report the first failure so the error matches
        // the primary candidate.
        let mut first_error = None;
        let candidates = delimited(trimmed, '{', '}')
            .into_iter()
            .chain(delimited(trimmed, '[', ']'));
        for candidate in candidates {
            match serde_json::from_str::<T>(candidate) {
                Ok(value) => return Ok(value),
                Err(err) => {
                    first_error.get_or_insert(err);
                }
            }
        }

        match first_error {
            Some(err) => Err(err.into()),
            None => Ok(serde_json::from_str(trimmed)?),
        }
    }

    /// Run LLM completion, parse as JSON, retry once on failure.
    ///
    /// On a parse failure the model is called a second time with the parse
    /// error and the first 500 bytes (at most) of its previous output, and
    /// asked to reply with valid JSON only.
    ///
    /// # Errors
    /// Propagates failures from either LLM call, and returns
    /// "JSON parse failed after retry" when the second reply does not parse
    /// either.
    async fn llm_json<T: serde::de::DeserializeOwned>(
        &self,
        system_prompt: &str,
        user_message: &str,
    ) -> Result<T> {
        /// Upper bound, in bytes, on how much of the bad output is echoed back.
        const MAX_ECHO_BYTES: usize = 500;

        let raw = self.llm_complete(system_prompt, user_message).await?;
        match Self::parse_json::<T>(&raw) {
            Ok(parsed) => Ok(parsed),
            Err(e) => {
                tracing::warn!(error = %e, "JSON parse failed, retrying with correction");

                // Slicing a `str` at an arbitrary byte offset panics when the
                // offset falls inside a multi-byte character, which is likely
                // for non-ASCII replies. Back up to the nearest boundary
                // (offset 0 is always one, so the loop terminates).
                let mut cut = raw.len().min(MAX_ECHO_BYTES);
                while !raw.is_char_boundary(cut) {
                    cut -= 1;
                }

                let retry_msg = format!(
                    "Your previous response was invalid JSON. The error was: {}\n\n\
                     Your raw output was:\n```\n{}\n```\n\n\
                     Please respond with ONLY valid JSON matching the requested schema. \
                     Do not include any text before or after the JSON object.",
                    e,
                    &raw[..cut]
                );
                let retry_raw = self.llm_complete(system_prompt, &retry_msg).await?;
                Self::parse_json::<T>(&retry_raw)
                    .map_err(|e2| anyhow::anyhow!("JSON parse failed after retry: {e2}"))
            }
        }
    }

    /// Returns structured question output for a given interview, if the
    /// LLM produced one. Called by the Orchestrator right after
    /// `interview()` for the same `user_input`. The two calls are
    /// independent (no shared LLM call) — this keeps the trait stable
    /// and avoids changing the InterviewResult shape that the existing
    /// Orchestrator and persistence code depend on.
    ///
    /// Returns `Ok(None)` when:
    /// - The LLM's `structured_questions` field was absent/null or empty.
    /// - JSON parsing of the response failed.
    /// - The interview was a non-task (chat) response.
    /// - No question survived sanitization.
    ///
    /// # Errors
    /// Only a failure of the underlying LLM call is propagated.
    pub async fn interview_structured(
        &self,
        user_input: &str,
    ) -> Result<Option<Vec<InterviewQuestionOutput>>> {
        /// Normalize one LLM-produced question, or drop it when it has no
        /// `id` or `text`.
        ///
        /// - Unknown kinds become `free_text` with their options cleared.
        /// - Options lacking a `value` or `label` are removed.
        /// - A choice question left without options becomes `free_text`
        ///   (forgiving — better than silently dropping the question).
        /// - A `yes_no` question left without options gets Yes/No defaults.
        fn sanitize(mut q: InterviewQuestionOutput) -> Option<InterviewQuestionOutput> {
            if q.id.is_empty() || q.text.is_empty() {
                return None;
            }

            let known_kind = matches!(
                q.kind.as_str(),
                "single_choice" | "multi_choice" | "yes_no" | "free_text"
            );
            if !known_kind {
                q.kind = "free_text".to_string();
                q.options.clear();
            }

            // Filter unusable options BEFORE testing for emptiness; otherwise
            // a choice question whose options are all invalid would keep its
            // choice kind while ending up with nothing to choose from.
            q.options
                .retain(|o| !o.value.is_empty() && !o.label.is_empty());

            if q.options.is_empty() {
                match q.kind.as_str() {
                    "single_choice" | "multi_choice" => {
                        q.kind = "free_text".to_string();
                    }
                    "yes_no" => {
                        q.options = vec![
                            InterviewOptionOutput {
                                value: "yes".to_string(),
                                label: "Yes".to_string(),
                                description: String::new(),
                            },
                            InterviewOptionOutput {
                                value: "no".to_string(),
                                label: "No".to_string(),
                                description: String::new(),
                            },
                        ];
                    }
                    _ => {}
                }
            }

            Some(q)
        }

        let system_prompt = INTERVIEW_SYSTEM_PROMPT;
        let user_message = format!(
            "The user said:\n\"{user_input}\"\n\n\
             LANGUAGE: Write ALL text output (questions, chat_response, structured question labels and descriptions) in the SAME language as the user's message above.\n\n\
             Analyze this message and produce a JSON object with:\n\
             - \"is_task\": true if the message requests a concrete action (create, read, write, run, find, fix, analyze, deploy, etc.) or describes something to build/execute. false for greetings, small talk, questions, gratitude, opinions, or conversational messages.\n\
             - \"chat_response\": (only when is_task=false) A natural, friendly response in the user's language. Be warm, concise, and helpful. Skip this field when is_task=true.\n\
             - \"complexity\": (only when is_task=true) \"simple\" for clear single-action requests that need no clarification (check weather, set alarm, search, calculate, simple file read/write, echo). \"complex\" for ambiguous or multi-step tasks (modify code, write blog post, deploy, analyze). Default to \"complex\" when unsure.\n\
             - \"questions\": (only when is_task=true) Up to 3 Socratic clarifying questions in the user's language. Empty array when is_task=false.\n\
             - \"structured_questions\": (only when is_task=true) Parallel array matching `questions`. Each entry has {{ \"id\": \"q1\", \"text\": \"...\", \"kind\": \"single_choice\"|\"free_text\"|\"yes_no\", \"options\": [{{ \"value\": \"...\", \"label\": \"...\" }}] }}. All text fields MUST be in the user's language. Omit or set null when you cannot predict reasonable options. Skip when is_task=false.\n\
             - \"scores\": (only when is_task=true) {{ \"goal_clarity\": 0.0-1.0, \"constraint_clarity\": 0.0-1.0, \"success_criteria\": 0.0-1.0 }}. Skip this field when is_task=false.\n\n\
             IMPORTANT SCORING (when is_task=true):\n\
             - Score GOAL_CLARITY 0.9+ ONLY if the request is immediately executable with no ambiguity\n\
             - Score CONSTRAINT_CLARITY 0.8+ ONLY if specific filenames, paths, or content are provided\n\
             - Score SUCCESS_CRITERIA 0.7+ ONLY if 'done' is clearly defined\n\
             - Be HONEST with clarity scores. When in doubt, score LOWER."
        );

        let raw = self.llm_complete(system_prompt, &user_message).await?;
        let parsed: InterviewResponse = match Self::parse_json(&raw) {
            Ok(p) => p,
            Err(e) => {
                // Structured questions are an optional enhancement, so a bad
                // reply degrades to "none" instead of failing the caller.
                tracing::warn!(error = %e, "interview_structured: JSON parse failed");
                return Ok(None);
            }
        };

        if !parsed.is_task {
            return Ok(None);
        }

        let plain_questions = &parsed.questions;
        let structured = match parsed.structured_questions {
            Some(s) if !s.is_empty() => s,
            _ => return Ok(None),
        };

        let sanitized: Vec<InterviewQuestionOutput> =
            structured.into_iter().filter_map(sanitize).collect();

        if sanitized.is_empty() {
            return Ok(None);
        }

        // Sanity log: how many plain questions matched structured ones.
        let match_count = sanitized
            .iter()
            .filter(|q| plain_questions.iter().any(|p| p == &q.text))
            .count();
        tracing::info!(
            structured = sanitized.len(),
            plain = plain_questions.len(),
            matched = match_count,
            "interview_structured produced questions"
        );

        Ok(Some(sanitized))
    }
}

#[async_trait]
impl OuroborosProtocol for OuroborosEngine {
    /// Replaces the stored persona prompt with `prompt`.
    ///
    /// Passing `None` clears whatever prompt was stored before.
    fn set_persona_prompt(&self, prompt: Option<String>) {
        let mut slot = self.persona_prompt.lock();
        *slot = prompt;
    }

    /// Trait entry point for the structured interview.
    ///
    /// Forwards `user_input` unchanged to the inherent
    /// `OuroborosEngine::interview_structured` and returns its result as-is.
    async fn interview_structured(
        &self,
        user_input: &str,
    ) -> Result<Option<Vec<InterviewQuestionOutput>>> {
        // Delegate to the inherent method on `OuroborosEngine` to keep
        // the long implementation body in one place. Inherent methods
        // are not visible through `Arc<dyn OuroborosProtocol>`, so
        // callers (the Orchestrator) get to this point through the
        // trait method.
        //
        // NOTE(review): this relies on the `OuroborosEngine::…` path
        // resolving to the inherent method rather than this trait method;
        // if the inherent method is ever removed or renamed, this call
        // would recurse into itself.
        OuroborosEngine::interview_structured(self, user_input).await
    }

    /// Runs the Interview phase for a single user message.
    ///
    /// The message is sent to the LLM together with `INTERVIEW_SYSTEM_PROMPT`
    /// and the reply is parsed as an `InterviewResponse`. When parsing fails,
    /// a response is assembled from `crate::degraded::degraded_interview`
    /// with fixed low clarity scores instead.
    ///
    /// Messages classified as non-tasks yield a chat-only result. Tasks get
    /// one exchange per clarifying question (with an empty answer) plus an
    /// ambiguity score built from the three clarity scores.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `llm_complete`.
    async fn interview(&self, user_input: &str) -> Result<InterviewResult> {
        self.set_phase(Phase::Interview);

        let user_message = format!(
            "The user said:\n\"{user_input}\"\n\n\
             LANGUAGE: Write ALL text output (questions, chat_response) in the SAME language as the user's message above.\n\n\
             Analyze this message and produce a JSON object with:\n\
             - \"is_task\": true if the message requests a concrete action (create, read, write, run, find, fix, analyze, deploy, etc.) or describes something to build/execute. false for greetings, small talk, questions, gratitude, opinions, or conversational messages.\n\
             - \"chat_response\": (only when is_task=false) A natural, friendly response in the user's language. Be warm, concise, and helpful. Skip this field when is_task=true.\n\
             - \"complexity\": (only when is_task=true) \"simple\" for clear single-action requests that need no clarification (check weather, set alarm, search, calculate, simple file read/write, echo). \"complex\" for ambiguous or multi-step tasks (modify code, write blog post, deploy, analyze). Default to \"complex\" when unsure.\n\
             - \"questions\": (only when is_task=true) Up to 3 Socratic clarifying questions in the user's language. Empty array when is_task=false.\n\
             - \"scores\": (only when is_task=true) {{ \"goal_clarity\": 0.0-1.0, \"constraint_clarity\": 0.0-1.0, \"success_criteria\": 0.0-1.0 }}. Skip this field when is_task=false.\n\n\
             IMPORTANT SCORING (when is_task=true):\n\
             - Score GOAL_CLARITY 0.9+ ONLY if the request is immediately executable with no ambiguity\n\
             - Score CONSTRAINT_CLARITY 0.8+ ONLY if specific filenames, paths, or content are provided\n\
             - Score SUCCESS_CRITERIA 0.7+ ONLY if 'done' is clearly defined\n\
             - Be HONEST with clarity scores. When in doubt, score LOWER."
        );

        let raw = self
            .llm_complete(INTERVIEW_SYSTEM_PROMPT, &user_message)
            .await?;

        let parsed: InterviewResponse = match Self::parse_json(&raw) {
            Ok(response) => response,
            Err(e) => {
                tracing::warn!(error = %e, "Failed to parse interview LLM response, using degraded fallback");
                // Build the fallback from the user's own message rather than
                // from generic defaults.
                let degraded = crate::degraded::degraded_interview(user_input);
                InterviewResponse {
                    is_task: degraded.is_task,
                    complexity: default_complexity(),
                    chat_response: degraded.chat_response,
                    questions: if degraded.questions.is_empty() {
                        vec!["Could you describe the goal in more detail?".into()]
                    } else {
                        degraded.questions
                    },
                    // The degraded path never yields structured questions;
                    // the frontend renders the plain markdown response.
                    structured_questions: None,
                    scores: Some(AmbiguityScores {
                        goal_clarity: 0.4,
                        constraint_clarity: 0.3,
                        success_criteria: 0.2,
                    }),
                }
            }
        };

        // Conversation, not a task: answer directly and stop here.
        if !parsed.is_task {
            let mut chat = InterviewResult::new();
            chat.original_message = user_input.to_string();
            chat.is_task = false;
            chat.chat_response = match parsed.chat_response.is_empty() {
                true => "Hello! How can I help you today?".to_string(),
                false => parsed.chat_response,
            };
            chat.ready_for_seed = false;
            chat.complexity = "n/a".to_string(); // chat, not a task

            tracing::info!(is_task = false, "Interview phase complete (chat)");

            return Ok(chat);
        }

        // Task: fall back to neutral mid-range scores when the LLM gave none.
        let clarity = match parsed.scores {
            Some(given) => given,
            None => AmbiguityScores {
                goal_clarity: 0.5,
                constraint_clarity: 0.5,
                success_criteria: 0.5,
            },
        };
        let ambiguity = AmbiguityScore::new(
            clarity.goal_clarity,
            clarity.constraint_clarity,
            clarity.success_criteria,
        );
        // Read the numeric value before `ambiguity` is moved into the result.
        let ambiguity_value = ambiguity.ambiguity();

        let mut outcome = InterviewResult::new();
        outcome.original_message = user_input.to_string();
        outcome.complexity = parsed.complexity.clone();
        parsed.questions.iter().for_each(|question| {
            outcome.add_exchange(question, "");
        });
        outcome.update_ambiguity(ambiguity);

        tracing::info!(
            ambiguity = ambiguity_value,
            ready = outcome.ready_for_seed,
            complexity = %parsed.complexity,
            questions = parsed.questions.len(),
            "Interview phase complete (task)"
        );

        Ok(outcome)
    }

    /// Runs the Seed phase: turns an interview into a `Seed` specification.
    ///
    /// The prompt contains the original request (or, when that is empty, the
    /// first interview question) and, if at least one answer is non-empty,
    /// the full question/answer list. The LLM reply is parsed as a
    /// `SeedResponse`; on parse failure the fields come from
    /// `crate::degraded::degraded_seed` instead.
    ///
    /// The returned seed is generation 0 with a fresh UUID and no parent.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `llm_complete`.
    async fn generate_seed(&self, interview: &InterviewResult) -> Result<Seed> {
        self.set_phase(Phase::Seed);

        // Text shown under "Original Request": prefer the stored message and
        // fall back to the first question (or an empty string) when absent.
        let original_message = match interview.original_message.is_empty() {
            false => interview.original_message.clone(),
            true => interview.questions.first().cloned().unwrap_or_default(),
        };

        // Only include the Q&A section when the user answered something.
        let nothing_answered = interview.answers.iter().all(|a| a.is_empty());
        let context_block = if nothing_answered {
            format!("## Original Request\n{original_message}")
        } else {
            let mut pairs = Vec::new();
            for (q, a) in interview.questions.iter().zip(interview.answers.iter()) {
                pairs.push(format!("Q: {q}\nA: {a}"));
            }
            let qa_block = pairs.join("\n\n");
            format!("## Original Request\n{original_message}\n\n## Clarification Q&A\n{qa_block}")
        };

        let user_message = format!(
            "{context_block}\n\n\
             LANGUAGE: Write the goal and all text fields in the SAME language as the user's original request above.\n\n\
             Generate a Seed specification that faithfully captures the user's ORIGINAL request.\n\
             The goal MUST preserve exact details (filenames, content, paths, languages) from the request.\n\
             Do NOT generalize or abstract — keep the specific details.\n\n\
             Produce a JSON object with:\n\
             - \"goal\": a single clear goal in the user's language that preserves ALL specifics from the original request\n\
             - \"constraints\": list of constraints\n\
             - \"acceptance_criteria\": list of measurable acceptance criteria that verify the specific details\n\
             - \"ontology\": list of {{ \"name\": \"\", \"entity_type\": \"\", \"description\": \"\" }} domain entities"
        );

        let raw = self
            .llm_complete(SEED_SYSTEM_PROMPT, &user_message)
            .await?;

        let spec: SeedResponse = match Self::parse_json(&raw) {
            Ok(response) => response,
            Err(e) => {
                tracing::warn!(error = %e, "Failed to parse seed LLM response, using degraded fallback");
                // Keep the user's intent by deriving the fallback from the interview.
                let degraded = crate::degraded::degraded_seed(interview);
                SeedResponse {
                    goal: degraded.goal,
                    constraints: degraded.constraints,
                    acceptance_criteria: degraded.acceptance_criteria,
                    ontology: degraded.ontology,
                }
            }
        };

        let seed = Seed {
            id: uuid::Uuid::new_v4(),
            goal: spec.goal,
            constraints: spec.constraints,
            acceptance_criteria: spec.acceptance_criteria,
            ontology: spec.ontology,
            created_at: Utc::now(),
            generation: 0,
            parent_seed_id: None,
            cspace_hint: None,
            original_request: interview.original_message.clone(),
            output_schema: None,
            project_id: None,
            workspace_context: None,
            mount_paths: Vec::new(),
        };

        tracing::info!(seed_id = %seed.id, goal = %seed.goal, "Seed generated");
        Ok(seed)
    }

    /// Placeholder for the Execute phase.
    ///
    /// The engine does not run tools itself: real execution goes through the
    /// kernel's AgentRuntime via the Supervisor, which the Orchestrator calls
    /// directly (`Supervisor::run_with_seed()`). This method exists only for
    /// protocol completeness. It sets the phase, logs, and returns a
    /// result with `success: false` and zeroed counters that the caller is
    /// expected to replace with the actual outcome.
    async fn execute(&self, seed: &Seed) -> Result<ExecutionResult> {
        self.set_phase(Phase::Execute);
        tracing::info!(seed_id = %seed.id, "Execute phase (delegated to AgentRuntime via Supervisor)");

        let placeholder = ExecutionResult {
            output: format!("Execution of seed {} delegated to agent runtime", seed.id),
            steps_completed: 0,
            // Not a real outcome; the caller should replace it with the actual result.
            success: false,
            tool_calls: Vec::new(),
            tokens_input: 0,
            tokens_output: 0,
            model_id: String::new(),
        };
        Ok(placeholder)
    }

    /// Runs the Evaluate phase: checks an execution result against its seed.
    ///
    /// Steps, in order:
    /// 1. Return the cached evaluation for `(seed, execution)` if there is one.
    /// 2. Run the mechanical check of the acceptance criteria against the
    ///    output. If every criterion passes, return a score of 1.0 without
    ///    calling the LLM (and cache it).
    /// 3. Otherwise ask the LLM for a semantic evaluation, passing the goal,
    ///    criteria, mechanical results and the first 3000 characters of the
    ///    output. A successfully parsed reply is cached; if `llm_json` fails,
    ///    `crate::degraded::degraded_evaluation` supplies the result and
    ///    nothing is cached.
    ///
    /// # Errors
    ///
    /// No error is produced by the code visible here: an `llm_json` failure
    /// is converted into the degraded evaluation rather than propagated.
    async fn evaluate(&self, seed: &Seed, execution: &ExecutionResult) -> Result<EvaluationResult> {
        self.set_phase(Phase::Evaluate);

        // Check cache first
        if let Some(cached) = self.eval_cache.get(seed, execution) {
            tracing::info!(seed_id = %seed.id, "Evaluation cache hit");
            return Ok(cached);
        }

        // Stage 1: Enhanced mechanical evaluation (language-agnostic)
        let mechanical = crate::evaluation::MechanicalEvalResult::evaluate(
            &seed.acceptance_criteria,
            &execution.output,
        );

        // If mechanical passes perfectly, skip LLM eval
        if mechanical.all_passed {
            let result = EvaluationResult {
                mechanical_pass: true,
                semantic_pass: None,
                consensus_pass: None,
                score: 1.0,
                notes: mechanical
                    .criterion_results
                    .iter()
                    .map(|r| r.criterion.to_string())
                    .collect(),
            };
            self.eval_cache.put(seed, execution, result.clone());
            tracing::info!(seed_id = %seed.id, score = 1.0, "Mechanical evaluation passed, skipping LLM");
            return Ok(result);
        }

        // Stage 2: Semantic evaluation via LLM (with retry)
        let mechanical_notes: String = mechanical
            .criterion_results
            .iter()
            .map(|r| format!("- {}: {} ({})", r.criterion, r.passed, r.reason))
            .collect::<Vec<_>>()
            .join("\n");

        // Truncate on a character boundary. Slicing the string at a fixed
        // byte offset panics when that offset lands inside a multi-byte
        // UTF-8 character, which happens readily with non-ASCII output.
        // `char_indices().nth(3000)` yields the byte offset at which the
        // 3001st character starts, i.e. the end of the first 3000 characters.
        let output_excerpt = match execution.output.char_indices().nth(3000) {
            Some((byte_end, _)) => &execution.output[..byte_end],
            None => execution.output.as_str(),
        };

        let system_prompt = EVALUATE_SYSTEM_PROMPT;
        let user_message = format!(
            "## Goal\n{}\n\n## Acceptance Criteria\n{}\n\n\
             ## Mechanical Check Results\n{}\n\n\
             ## Execution Output (first 3000 chars)\n{}\n\n\
             Evaluate whether the execution output satisfies the goal and acceptance criteria.\n\
             Produce a JSON object:\n\
             - \"mechanical_pass\": {}\n\
             - \"semantic_pass\": true/false\n\
             - \"score\": 0.0 to 1.0\n\
             - \"notes\": list of evaluation notes",
            seed.goal,
            seed.acceptance_criteria
                .iter()
                .enumerate()
                .map(|(i, c)| format!("{}. {}", i + 1, c))
                .collect::<Vec<_>>()
                .join("\n"),
            mechanical_notes,
            output_excerpt,
            mechanical.all_passed,
        );

        let result = match self
            .llm_json::<EvaluationResponse>(system_prompt, &user_message)
            .await
        {
            Ok(parsed) => {
                let r = EvaluationResult {
                    mechanical_pass: parsed.mechanical_pass,
                    semantic_pass: Some(parsed.semantic_pass),
                    consensus_pass: None,
                    score: parsed.score,
                    notes: parsed.notes,
                };
                self.eval_cache.put(seed, execution, r.clone());
                r
            }
            Err(e) => {
                // The degraded result is intentionally returned without being
                // cached, so a later call can try the LLM again.
                tracing::warn!(error = %e, "Evaluation JSON parse failed after retry, using degraded fallback");
                crate::degraded::degraded_evaluation(seed, &execution.output, mechanical.all_passed)
            }
        };

        tracing::info!(
            seed_id = %seed.id,
            mechanical = result.mechanical_pass,
            semantic = ?result.semantic_pass,
            score = result.score,
            "Evaluation complete"
        );

        Ok(result)
    }

    /// Runs the Evolve phase: produces an improved seed after a failed
    /// evaluation.
    ///
    /// Returns `Ok(None)` when `evaluation.all_passed()` holds and the score
    /// is at least 0.8. Otherwise the evaluation is recorded, a prompt is
    /// assembled from the seed and evaluation (plus, from generation 2 on,
    /// a lateral-thinking block when stagnation is detected and a regression
    /// block when regressions exist), and the LLM reply is parsed as a
    /// `SeedResponse`. If parsing fails, the current seed's goal,
    /// constraints, acceptance criteria and ontology are reused unchanged.
    ///
    /// The new seed takes its id, generation, parent id and `cspace_hint`
    /// from `Seed::evolved_from(seed)`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `llm_complete`.
    async fn evolve(&self, seed: &Seed, evaluation: &EvaluationResult) -> Result<Option<Seed>> {
        self.set_phase(Phase::Evolve);

        // If the evaluation passed, no need to evolve.
        if evaluation.all_passed() && evaluation.score >= 0.8 {
            tracing::info!(seed_id = %seed.id, "Evaluation passed, no evolution needed");
            return Ok(None);
        }

        // ── Record generation for stagnation / regression detection ──
        self.record_evaluation(seed.clone(), evaluation);

        // ── Build evolution context (basic + lateral + regression) ──
        let base_context = format!(
            "## Original Seed\n\
             Goal: {}\n\
             Constraints: {}\n\
             Acceptance Criteria: {}\n\n\
             ## Evaluation Result\n\
             Mechanical pass: {}\n\
             Semantic pass: {}\n\
             Score: {}\n\
             Notes: {}",
            seed.goal,
            seed.constraints.join(", "),
            seed.acceptance_criteria
                .iter()
                .enumerate()
                .map(|(i, c)| format!("{}. {}", i + 1, c))
                .collect::<Vec<_>>()
                .join("\n"),
            evaluation.mechanical_pass,
            evaluation
                .semantic_pass
                .map(|p| p.to_string())
                .unwrap_or_else(|| "not evaluated".into()),
            evaluation.score,
            evaluation.notes.join("; "),
        );

        let mut context_blocks = vec![base_context];

        // Lateral thinking — only when stagnant (gen 2+).
        if seed.generation >= 2 {
            if let Some(pattern) = self.detect_stagnation() {
                tracing::info!(seed_id = %seed.id, pattern = ?pattern, "Stagnation detected, applying lateral thinking");

                // Collect already-tried personas before dropping the read lock.
                // Personas are stored in `cspace_hint` as "lateral:<name>";
                // `strip_prefix` both tests for the marker and removes it, so
                // no byte offset has to be kept in sync with the prefix.
                let tried: Vec<String> = {
                    let history = self.generation_history.read();
                    history
                        .iter()
                        .filter_map(|r| r.seed.cspace_hint.as_deref())
                        .filter_map(|h| h.strip_prefix("lateral:"))
                        .map(|name| name.to_string())
                        .collect()
                };
                let tried_refs: Vec<&str> = tried.iter().map(|s| s.as_str()).collect();

                if let Some(persona) = crate::lateral::select_persona(pattern, &tried_refs) {
                    let lateral = crate::lateral::build_lateral_prompt(
                        persona,
                        &seed.goal,
                        &format!(
                            "Score={:.2}, passed={}",
                            evaluation.score,
                            evaluation.all_passed()
                        ),
                        &evaluation.notes,
                    );
                    context_blocks.push(lateral);

                    // Track the persona via cspace_hint. The write guard is
                    // scoped to this block, so it is released before the
                    // `.await` further down.
                    let mut guard = self.generation_history.write();
                    if let Some(last) = guard.last_mut() {
                        last.seed.cspace_hint = Some(format!("lateral:{}", persona.name));
                    }
                }
            }

            // Regression context — always on gen 2+.
            let regressions = self.detect_regressions();
            if !regressions.is_empty() {
                let reg_text =
                    crate::regression::RegressionDetector::format_for_prompt(&regressions);
                context_blocks.push(reg_text);
                tracing::info!(
                    seed_id = %seed.id,
                    count = regressions.len(),
                    "Injecting regression context"
                );
            }
        }

        let user_message = context_blocks.join("\n\n---\n\n");

        let system_prompt = EVOLVE_SYSTEM_PROMPT;
        let raw = self.llm_complete(system_prompt, &user_message).await?;
        let parsed: SeedResponse = Self::parse_json(&raw).unwrap_or_else(|e| {
            // On an unparseable reply, carry the current spec forward unchanged.
            tracing::warn!(error = %e, "Failed to parse evolve LLM response");
            SeedResponse {
                goal: seed.goal.clone(),
                constraints: seed.constraints.clone(),
                acceptance_criteria: seed.acceptance_criteria.clone(),
                ontology: seed.ontology.clone(),
            }
        });

        let evolved = Seed::evolved_from(seed);

        let new_seed = Seed {
            id: evolved.id,
            goal: parsed.goal,
            constraints: parsed.constraints,
            acceptance_criteria: parsed.acceptance_criteria,
            ontology: parsed.ontology,
            created_at: Utc::now(),
            generation: evolved.generation,
            parent_seed_id: evolved.parent_seed_id,
            cspace_hint: evolved.cspace_hint,
            original_request: seed.original_request.clone(),
            // NOTE(review): `output_schema` is reset to `None` while the
            // project/workspace/mount fields below are carried over from
            // `seed` — confirm that dropping the schema is intended.
            output_schema: None,
            project_id: seed.project_id,
            workspace_context: seed.workspace_context.clone(),
            mount_paths: seed.mount_paths.clone(),
        };

        tracing::info!(
            original_seed = %seed.id,
            evolved_seed = %new_seed.id,
            generation = new_seed.generation,
            "Seed evolved"
        );

        Ok(Some(new_seed))
    }
}

// ---------------------------------------------------------------------------
// System prompts
// ---------------------------------------------------------------------------

/// System prompt for the Interview phase, passed to the LLM by `interview`.
///
/// It tells the model to classify the message as task or conversation, to
/// answer in the user's language, to score ambiguity conservatively, to ask
/// at most three targeted questions, and to reply with JSON only.
///
/// A trailing `\` in the literal joins the next source line without inserting
/// a newline; all other line breaks are part of the prompt text.
const INTERVIEW_SYSTEM_PROMPT: &str = "\
You are the Interview phase of the Ouroboros protocol. \
Your job: determine whether the user's message is a task or conversation, \
and if it's a task, assess ambiguity along three dimensions.

## Language Fidelity (CRITICAL)
You MUST match the language of the user's message in ALL output text.
- Whatever language the user uses, you use that SAME language. No exceptions.
- This applies to: questions, chat_response, structured_questions labels/descriptions.
- Never translate, paraphrase, or switch to a different language regardless of context length or turn number.

## Critical Boundaries
- NEVER propose solutions. You ask, you do not implement.
- NEVER say \"I will...\" or \"Let me...\" — you are an interviewer, not an executor.
- NEVER skip scoring. Every task gets ambiguity scores.

## Scoring Policy
Be HONEST, not generous:
- Score GOAL_CLARITY below 0.5 if the user's intent is genuinely ambiguous
- Score CONSTRAINT_CLARITY below 0.5 if no specifics are mentioned
- Score SUCCESS_CRITERIA below 0.5 if \"done\" is undefined
- Reserve 0.9+ for requests that are immediately executable as-is
- When in doubt, score LOWER — it is cheaper to ask than to guess wrong

## Conversation Detection
- Greetings, thanks, opinions, questions about capabilities → is_task: false
- Any verb implying action (create, fix, find, deploy, analyze, review) → is_task: true
- When uncertain, default to is_task: true

## Question Quality
Bad: \"Could you tell me more about your requirements?\"
Good: \"You said 'optimize the API' — optimize for latency, throughput, or cost?\"

Questions must target a SPECIFIC ambiguity, not invite a general brain dump.
Maximum 3 questions. Each must be answerable in one sentence.

Always respond with valid JSON in the exact format requested. \
Do not include any text outside the JSON object.";

/// System prompt for the Seed phase, passed to the LLM by `generate_seed`.
///
/// It frames the seed as a contract for an autonomous agent and asks for a
/// complete, specific and measurable specification that stays within the
/// scope of the user's request, returned as JSON only.
///
/// A trailing `\` in the literal joins the next source line without inserting
/// a newline; all other line breaks are part of the prompt text.
const SEED_SYSTEM_PROMPT: &str = "\
You are the Seed Architect of the Ouroboros protocol. \
Your job: crystallize interview results into an immutable specification.

## Core Principle
A Seed is a CONTRACT — it will be executed by an autonomous agent without \
further human input. If the Seed is ambiguous, the execution WILL go wrong.

## Mandatory Properties
- COMPLETE: Contains EVERYTHING the agent needs. No assumed context.
- SPECIFIC: Exact filenames, paths, languages, frameworks — never \"a file\" \
  or \"the system\".
- MEASURABLE: Each acceptance criterion can be verified by running a command \
  or checking file content. No subjective criteria like \"clean code\".

## Scope Guard
Do NOT expand beyond the user's request:
- If they asked for a single function, do not spec a module
- If they specified a language, do not suggest alternatives
- If they named a file, use THAT filename, not a \"better\" one

If the interview was insufficient to produce a complete Seed, include the \
constraint: \"Requires human clarification: [what's missing]\"

Always respond with valid JSON in the exact format requested. \
Do not include any text outside the JSON object.";

/// System prompt for the Evaluate phase, passed to the LLM by `evaluate`
/// when the mechanical check did not pass every criterion.
///
/// It describes a two-stage (mechanical, then semantic) assessment, the
/// score bands, score caps for common anti-patterns, and the requirement
/// that credit is given only for demonstrated results, with JSON-only output.
///
/// A trailing `\` in the literal joins the next source line without inserting
/// a newline; all other line breaks are part of the prompt text.
const EVALUATE_SYSTEM_PROMPT: &str = "\
You are the Evaluator of the Ouroboros protocol. \
Your job: determine whether execution output satisfies the Seed specification.

## Two-Stage Evaluation

Stage 1 — Mechanical: Does the output explicitly address each acceptance criterion?
- If the agent claims to have created a file, look for the file content or path
- If the agent claims to have run a command, look for command output
- Absence of evidence = evidence of absence

Stage 2 — Semantic: Does the output actually solve the user's intent?
- The agent may check every box but still miss the point
- Look for the gap between \"technically correct\" and \"genuinely useful\"

## Scoring Policy
- 0.9–1.0: All criteria met, output is complete and correct
- 0.7–0.8: Core goal achieved, minor issues or missing optional elements
- 0.5–0.6: Partially done, significant gaps
- Below 0.5: Fundamentally failed or produced nothing useful

## Anti-Patterns (score penalty)
- Agent claims completion without showing evidence → cap at 0.5
- Agent solved a different problem than specified → cap at 0.4
- Agent made changes not in the Seed scope → flag as scope violation
- Agent output is generic/boilerplate that could apply to anything → cap at 0.3

## Evidence Requirement
Do NOT give credit for claims. Give credit for DEMONSTRATED results:
- \"I created the file\" → Show me the file content
- \"Tests pass\" → Show me the test output
- \"The bug is fixed\" → Show me before/after behavior

Always respond with valid JSON in the exact format requested. \
Do not include any text outside the JSON object.";

/// System prompt for the Evolve phase, passed to the LLM by `evolve`.
///
/// It asks the model to classify the failure (spec, execution or
/// impossible), to preserve what worked, to narrow rather than expand scope,
/// and to consider restarting or decomposing when issues persist, with
/// JSON-only output.
///
/// A trailing `\` in the literal joins the next source line without inserting
/// a newline; all other line breaks are part of the prompt text.
const EVOLVE_SYSTEM_PROMPT: &str = "\
You are the Evolve phase of the Ouroboros protocol. \
Your job: improve a Seed based on evaluation failure analysis.

## Before You Evolve
1. Read the evaluation notes carefully — WHAT failed and WHY
2. Distinguish between:
   - SPEC issues (Seed was ambiguous or incomplete) → Fix the Seed
   - EXECUTION issues (Agent misunderstood or went off-track) → Add constraints/guards
   - IMPOSSIBLE issues (Goal is infeasible as stated) → Flag for human review

## Evolution Rules
- Preserve what WORKED — do not change passing acceptance criteria
- Add constraints that prevent known failure modes
- Tighten criteria that were too vague
- If the goal itself was wrong, flag it rather than silently changing it

## Scope Guard
Evolution narrows scope, never expands it:
- Do NOT add new features the user didn't request
- Do NOT change the goal to something \"better\"
- Do NOT add acceptance criteria for problems the user didn't mention

## Stagnation Detection
If this is generation 3+ and the same issues persist:
- The Seed may be fundamentally flawed — suggest restarting the interview
- Consider whether the task needs to be decomposed into smaller Seeds

Always respond with valid JSON in the exact format requested. \
Do not include any text outside the JSON object.";

/// Compact `Debug` output for the engine: only the current phase and the
/// model id are shown; every other field is omitted.
impl std::fmt::Debug for OuroborosEngine {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("OuroborosEngine");
        repr.field("phase", &self.phase());
        repr.field("model", &self.model.id);
        repr.finish()
    }
}