// assay_core/judge/mod.rs

1use crate::model::TestInput;
2use crate::providers::llm::LlmClient;
3use crate::storage::judge_cache::JudgeCache;
4use serde_json::json;
5use std::sync::Arc;
6
/// Runtime configuration for a single judge invocation, resolved from the
/// CLI / suite settings before `JudgeService` is constructed.
#[derive(Clone, Debug)]
pub struct JudgeRuntimeConfig {
    /// Whether judge evaluation is allowed at all; when false, tests that
    /// require judge results produce a config error.
    pub enabled: bool,
    pub provider: String, // "openai", "fake", "none"
    /// Model identifier for the provider; falls back to "default" when caching.
    pub model: Option<String>,
    /// Number of LLM votes taken per rubric (majority decides).
    pub samples: u32,
    /// Sampling temperature forwarded to the provider; also part of the cache key.
    pub temperature: f32,
    /// Completion token cap forwarded to the provider; also part of the cache key.
    pub max_tokens: u32,
    /// When true, bypass the judge cache and always make live calls.
    pub refresh: bool,
}
17
/// Orchestrates judge evaluation: trace lookup, cache lookup, and live
/// LLM voting, writing results into test metadata.
#[derive(Clone)]
pub struct JudgeService {
    config: JudgeRuntimeConfig,
    cache: JudgeCache,
    // None when the judge is disabled or no provider was configured;
    // `evaluate` errors if a live call is needed without a client.
    client: Option<Arc<dyn LlmClient>>,
}
24
25impl JudgeService {
26    pub fn new(
27        config: JudgeRuntimeConfig,
28        cache: JudgeCache,
29        client: Option<Arc<dyn LlmClient>>,
30    ) -> Self {
31        Self {
32            config,
33            cache,
34            client,
35        }
36    }
37
38    pub async fn evaluate(
39        &self,
40        test_id: &str,
41        rubric_id: &str,
42        data: &TestInput,
43        response_text: &str,
44        suite_rubric_version: Option<&str>,
45        meta: &mut serde_json::Value,
46    ) -> anyhow::Result<()> {
47        let rubric_version = suite_rubric_version.unwrap_or("v1");
48
49        // 1. Trace Check
50        if let Some(_trace_judge) = meta.pointer(&format!("/assay/judge/{}", rubric_id)) {
51            // Already present in trace
52            // We could validate it, but for now accept it.
53            // Ensure "source" is "trace" if not set?
54            return Ok(());
55        }
56
57        // 2. Judge Disabled Check
58        if !self.config.enabled {
59            anyhow::bail!(
60                "config error: test '{}' requires judge results ('{}:{}'), but judge is disabled.\n\
61                 hint: options:\n\
62                 1) run live judge: assay ci --judge openai\n\
63                 2) run replay/CI offline: provide trace meta at meta.assay.judge.{}\n\
64                 and re-run with: assay ci --trace-file traces.jsonl --no-judge",
65                test_id, rubric_id, rubric_version, rubric_id
66            );
67        }
68
69        let client = self.client.as_ref().ok_or_else(|| {
70            anyhow::anyhow!(
71                "config error: judge enabled but no client provided (verify --judge <provider>)"
72            )
73        })?;
74
75        // 3. Cache Check
76        let prompt = format!(
77            "Rubric: {}\nInput: {}\nResponse: {}\nContext: {:?}",
78            rubric_id, data.prompt, response_text, data.context
79        );
80        let input_hash = format!("{:x}", md5::compute(&prompt)); // Simple hash
81        let cache_key = self.generate_cache_key(rubric_id, rubric_version, &input_hash);
82
83        if !self.config.refresh {
84            if let Some(mut cached) = self.cache.get(&cache_key)? {
85                if let Some(obj) = cached.as_object_mut() {
86                    obj.insert("source".to_string(), json!("cache"));
87                    obj.insert(
88                        "cached_at".to_string(),
89                        json!(chrono::Utc::now().to_rfc3339()),
90                    );
91                }
92                self.inject_result(meta, rubric_id, cached)?;
93                return Ok(());
94            }
95        }
96
97        // 4. Live Call (Voting)
98        let samples = self.config.samples;
99        let mut votes = Vec::new();
100        let mut rationales = Vec::new();
101
102        for _ in 0..samples {
103            // In a real impl, we'd use the actual rubric prompt template
104            let _sys_prompt = format!("You are a judge for rubric {}. Output JSON with {{passed: bool, rationale: string}}.", rubric_id);
105            // This prompt is simplistic; strict impl would use templates.
106            let resp = client.complete(&prompt, None).await?; // Assuming prompt contains everything
107                                                              // Parse JSON
108                                                              // Mock parsing for now if fake/dummy, or try parse
109                                                              // For MVP, if client is dummy, it returns text.
110                                                              // We need to robustly parse the LLM output.
111
112            // Assume the client returns a string that contains JSON.
113            // If dummy: "hello from dummy". This won't parse.
114            // If "fake" embedder logic was here? No, client is LlmClient.
115
116            // For now, let's assume the LLM returns valid JSON or we fail.
117            // We need a proper rubric prompt construction.
118            votes.push(self.mock_vote_logic(rubric_id, &resp.text)); // Temp mock
119            rationales.push(resp.text);
120        }
121
122        // Aggregation
123        let pass_count = votes.iter().filter(|&&v| v).count() as u32;
124        let agreement = pass_count as f64 / samples as f64;
125        let passed = pass_count as f64 > (samples as f64 / 2.0); // Majority
126
127        // Status check
128        // If disagreement (agreement < 1.0), we might warn later in the Metric logic?
129        // Or store "unstable": true in meta?
130
131        let result = json!({
132            "rubric_version": rubric_version,
133            "passed": passed,
134            "score": agreement, // Score is agreement ratio? Or binary?
135            // Usually score is 1.0 (pass) or 0.0 (fail) or agreement?
136            "source": "live",
137            "samples": votes,
138            "agreement": agreement,
139            "rationale": rationales.first().cloned().unwrap_or_default(), // Take first
140            "cached_at": chrono::Utc::now().to_rfc3339()
141        });
142
143        // Store in Cache
144        self.cache.put(
145            &cache_key,
146            &self.config.provider,
147            self.config.model.as_deref().unwrap_or("default"),
148            rubric_id,
149            rubric_version,
150            &result,
151        )?;
152
153        self.inject_result(meta, rubric_id, result)?;
154
155        Ok(())
156    }
157
158    fn generate_cache_key(
159        &self,
160        rubric_id: &str,
161        rubric_version: &str,
162        input_hash: &str,
163    ) -> String {
164        // Use actual template hash if available
165        let template_version = "v1-simple";
166        let raw = format!(
167            "{}:{}:{}:{}:{}:{}:{}:{}:{}",
168            self.config.provider,
169            self.config.model.as_deref().unwrap_or(""),
170            rubric_id,
171            rubric_version,
172            self.config.temperature,
173            self.config.max_tokens,
174            self.config.samples,
175            template_version,
176            input_hash
177        );
178        format!("{:x}", md5::compute(raw))
179    }
180
181    fn inject_result(
182        &self,
183        meta: &mut serde_json::Value,
184        rubric_id: &str,
185        result: serde_json::Value,
186    ) -> anyhow::Result<()> {
187        if let Some(obj) = meta.as_object_mut() {
188            let assay = obj
189                .entry("assay")
190                .or_insert(json!({}))
191                .as_object_mut()
192                .unwrap();
193            let judge = assay
194                .entry("judge")
195                .or_insert(json!({}))
196                .as_object_mut()
197                .unwrap();
198            judge.insert(rubric_id.to_string(), result);
199        }
200        Ok(())
201    }
202
203    // logic to "mock" vote if text isn't JSON (for dev speed)
204    fn mock_vote_logic(&self, _rubric: &str, text: &str) -> bool {
205        // If "dummy" client, always pass?
206        // Or check text content
207        !text.contains("fail")
208    }
209}