assay_core/validate/
mod.rs

1use crate::config::path_resolver::PathResolver;
2use crate::errors::diagnostic::{codes, Diagnostic};
3use crate::model::EvalConfig;
4use crate::model::Expected;
5use crate::providers::llm::LlmClient; // Import trait for .complete()
6use crate::providers::trace::TraceClient;
7use std::path::{Path, PathBuf};
8
/// Caller-supplied inputs that control what [`validate`] checks.
///
/// `Default` yields "no trace file, no baseline file, non-strict replay",
/// so callers can use struct-update syntax for the fields they care about.
#[derive(Debug, Clone, Default)]
pub struct ValidateOptions {
    /// Trace file to validate test coverage against, if any.
    pub trace_file: Option<PathBuf>,
    /// Baseline file whose suite is compared with the config's suite, if any.
    pub baseline_file: Option<PathBuf>,
    /// When `true`, each replayed response is also checked for the
    /// precomputed metadata required by its test's expectation.
    pub replay_strict: bool,
}
15
16#[derive(Debug, Clone, Default)]
17pub struct ValidateReport {
18    pub diagnostics: Vec<Diagnostic>,
19}
20
21pub async fn validate(
22    cfg: &EvalConfig,
23    opts: &ValidateOptions,
24    resolver: &PathResolver,
25) -> anyhow::Result<ValidateReport> {
26    let mut diags = Vec::new();
27
28    // 1. Path Resolution Checks (E_PATH_NOT_FOUND)
29    // Actually the CLI loader does this, but we can double check config assets if any.
30    // For now, let's assume config is loaded correctly if we are here,
31    // but check the explicitly provided trace/baseline files if they exist.
32
33    if let Some(path) = &opts.trace_file {
34        if !path.exists() {
35            diags.push(
36                Diagnostic::new(
37                    codes::E_PATH_NOT_FOUND,
38                    format!("Trace file not found: {}", path.display()),
39                )
40                .with_context(serde_json::json!({ "path": path }))
41                .with_source("validate")
42                .with_fix_step("Ensure the --trace-file path is correct and accessible"),
43            );
44        }
45    }
46
47    if let Some(path) = &opts.baseline_file {
48        if !path.exists() {
49            diags.push(
50                Diagnostic::new(
51                    codes::E_PATH_NOT_FOUND,
52                    format!("Baseline file not found: {}", path.display()),
53                )
54                .with_context(serde_json::json!({ "path": path }))
55                .with_source("validate")
56                .with_fix_step("Ensure the --baseline path is correct and accessible"),
57            );
58        }
59    }
60
61    // Return early if basic files missing to avoid noise
62    if !diags.is_empty() {
63        return Ok(ValidateReport { diagnostics: diags });
64    }
65
66    // 2. Load Trace & Baseline for deeper checks
67    let trace_client = if let Some(path) = &opts.trace_file {
68        match TraceClient::from_path(path) {
69            Ok(client) => Some(client),
70            Err(e) => {
71                diags.push(
72                    Diagnostic::new(
73                        codes::E_TRACE_INVALID,
74                        format!("Failed to parse trace file: {}", e),
75                    )
76                    .with_source("trace")
77                    .with_context(serde_json::json!({ "path": path, "error": e.to_string() })),
78                );
79                return Ok(ValidateReport { diagnostics: diags });
80            }
81        }
82    } else {
83        None
84    };
85
86    let baseline = if let Some(path) = &opts.baseline_file {
87        match crate::baseline::Baseline::load(path) {
88            Ok(b) => Some(b),
89            Err(e) => {
90                diags.push(
91                    Diagnostic::new(
92                        codes::E_BASE_MISMATCH,
93                        format!("Failed to parse baseline: {}", e),
94                    )
95                    .with_source("baseline")
96                    .with_context(serde_json::json!({ "path": path, "error": e.to_string() })),
97                );
98                return Ok(ValidateReport { diagnostics: diags });
99            }
100        }
101    } else {
102        None
103    };
104
105    // 3. Trace Coverage (E_TRACE_MISS)
106    if let Some(client) = &trace_client {
107        for tc in &cfg.tests {
108            // We use the same lookup logic as TraceClient::complete
109            // But here we want to collect ALL misses, not just fail on first.
110            // Since `complete` is not exposed as "check only", we iterate.
111            // Actually TraceClient doesn't expose keys publicly yet.
112            // We might need to call complete and catch error?
113            // OR better: call complete() on client. Since it returns LlmResponse or Err(Diagnostic)
114
115            let res = client
116                .complete(&tc.input.prompt, tc.input.context.as_deref())
117                .await;
118            if let Err(e) = res {
119                // If it's a diagnostic, push it.
120                // We use try_map_error from errors module
121                if let Some(diag) = crate::errors::try_map_error(&e) {
122                    // Enrich with test_id
123                    let mut d = diag.clone();
124                    if let serde_json::Value::Object(ref mut map) = d.context {
125                        map.insert("test_id".into(), serde_json::json!(tc.id));
126                        map.insert("trace_file".into(), serde_json::json!(opts.trace_file));
127                    }
128                    d.source = "trace".to_string();
129                    diags.push(d);
130                } else {
131                    // Unexpected error?
132                    diags.push(
133                        Diagnostic::new("E_UNKNOWN", format!("Unexpected trace error: {}", e))
134                            .with_source("trace"),
135                    );
136                }
137            } else if let Ok(resp) = res {
138                // Check Strict Replay (Requirement 4)
139                if opts.replay_strict {
140                    validate_strict_requirements(tc, &resp, &mut diags, opts.trace_file.as_deref());
141                }
142
143                // Check Embedding Dims (Requirement 5)
144                // This is checking per-test, potentially spammy.
145                // Better to check once per trace? But we don't have access to all embeddings.
146                // We'll check via response meta if available.
147                check_embedding_dims(&resp, &mut diags, opts.trace_file.as_deref());
148
149                // Check Policy (Requirement 2: ArgsValid)
150                if let Expected::ArgsValid {
151                    policy: Some(policy_path),
152                    ..
153                } = &tc.expected
154                {
155                    // 1. Load Policy
156                    // For now, load fully. In future, cache via resolver.
157                    // We need to resolve relative to config?
158                    // resolver.resolve_path(policy_path)?
159                    let mut p_str = policy_path.clone();
160                    resolver.resolve_str(&mut p_str);
161                    let policy_file = std::path::PathBuf::from(p_str);
162                    if !policy_file.exists() {
163                        diags.push(
164                            Diagnostic::new(
165                                codes::E_PATH_NOT_FOUND,
166                                format!("Policy file not found: {}", policy_file.display()),
167                            )
168                            .with_source("validate")
169                            .with_context(serde_json::json!({ "path": policy_file })),
170                        );
171                    } else {
172                        match crate::model::Policy::load(&policy_file) {
173                            Ok(pol) => {
174                                // 2. Get Tool Calls from Trace
175                                let tool_calls =
176                                    resp.meta.get("tool_calls").and_then(|v| v.as_array());
177
178                                if let Some(calls) = tool_calls {
179                                    // Convert to policy value for engine
180                                    let policy_val = serde_json::to_value(
181                                        pol.tools.arg_constraints.unwrap_or_default(),
182                                    )
183                                    .unwrap_or(serde_json::Value::Null);
184
185                                    // Check for Allowed/Denied lists first?
186                                    // Let's use simple policy_engine:evaluate_tool_args which expects JSON schema map.
187                                    // Wait, Policy struct has complex structure.
188                                    // policy.tools.arg_constraints is Map<Tool, Schema>.
189                                    // policy.tools.allow/deny are lists.
190
191                                    // Simplified validation for v1.2.1: Just check args against schema if present.
192                                    // Detailed enforcement requires full policy engine context (TODO for v1.3)
193
194                                    for call in calls {
195                                        let tool_name = call
196                                            .get("tool_name")
197                                            .and_then(|s| s.as_str())
198                                            .unwrap_or("unknown");
199                                        let args =
200                                            call.get("args").unwrap_or(&serde_json::Value::Null);
201
202                                        // Need to construct the "policy" value expected by evaluate_tool_args
203                                        // It expects { "ToolName": Schema, ... }
204                                        // This is exactly `arg_constraints`.
205
206                                        let verdict = crate::policy_engine::evaluate_tool_args(
207                                            &policy_val,
208                                            tool_name,
209                                            args,
210                                        );
211
212                                        if let crate::policy_engine::VerdictStatus::Blocked =
213                                            verdict.status
214                                        {
215                                            let mut d = Diagnostic::new(
216                                                verdict.reason_code,
217                                                "Policy violation in tool call",
218                                            )
219                                            .with_source("policy")
220                                            .with_context(verdict.details);
221
222                                            // Add trace context
223                                            if let serde_json::Value::Object(ref mut map) =
224                                                d.context
225                                            {
226                                                map.insert("tool".into(), tool_name.into());
227                                                map.insert("test_id".into(), tc.id.clone().into());
228                                            }
229                                            diags.push(d);
230                                        }
231                                    }
232                                } else {
233                                    // No tool calls found in trace?
234                                    // If policy expects validation, maybe warn?
235                                }
236                            }
237                            Err(e) => {
238                                diags.push(
239                                    Diagnostic::new(
240                                        codes::E_CFG_PARSE,
241                                        format!("Failed to parse policy: {}", e),
242                                    )
243                                    .with_source("policy"),
244                                );
245                            }
246                        }
247                    }
248                }
249            }
250        }
251    }
252
253    // Baseline Compat (Requirement 3)
254    if let Some(base) = &baseline {
255        if base.suite != cfg.suite {
256            diags.push(
257                Diagnostic::new(codes::E_BASE_MISMATCH, "Baseline suite mismatch")
258                    .with_source("baseline")
259                    .with_context(serde_json::json!({
260                        "expected_suite": cfg.suite,
261                        "baseline_suite": base.suite,
262                        "baseline_file": opts.baseline_file
263                    }))
264                    .with_fix_step("Use the baseline file created for this suite")
265                    .with_fix_step("Or export a new baseline: assay ci ... --export-baseline ..."),
266            );
267        }
268    }
269
270    // Deduplicate diagnostics?
271    // E_EMB_DIMS might be spammy if every test fails.
272    // Simple dedup by code + message signature could be added later.
273
274    Ok(ValidateReport { diagnostics: diags })
275}
276
277fn validate_strict_requirements(
278    tc: &crate::model::TestCase,
279    resp: &crate::model::LlmResponse,
280    diags: &mut Vec<Diagnostic>,
281    trace_path: Option<&Path>,
282) {
283    let mut missing = Vec::new();
284
285    // Check Semantic Metrics -> Need Embeddings
286    if let Expected::SemanticSimilarityTo { .. } = &tc.expected {
287        if resp.meta.pointer("/assay/embeddings/response").is_none() {
288            missing.push(serde_json::json!({
289                "requirement": "embeddings",
290                "needed_by": ["semantic_similarity_to"],
291                "meta_path": "meta.assay.embeddings"
292            }));
293        }
294    }
295
296    // Check Judge -> Need Judge Results
297    // Only if expected is Faithfulness or Relevance
298    match &tc.expected {
299        Expected::Faithfulness { .. } => {
300            if resp.meta.pointer("/assay/judge/faithfulness").is_none() {
301                missing.push(serde_json::json!({
302                    "requirement": "judge_faithfulness",
303                    "needed_by": ["faithfulness"],
304                    "meta_path": "meta.assay.judge.faithfulness"
305                }));
306            }
307        }
308        Expected::Relevance { .. } => {
309            if resp.meta.pointer("/assay/judge/relevance").is_none() {
310                missing.push(serde_json::json!({
311                    "requirement": "judge_relevance",
312                    "needed_by": ["relevance"],
313                    "meta_path": "meta.assay.judge.relevance"
314                }));
315            }
316        }
317        _ => {}
318    }
319
320    if !missing.is_empty() {
321        diags.push(
322            Diagnostic::new(
323                codes::E_REPLAY_STRICT_MISSING,
324                "Strict replay requires precomputed data that is missing from trace",
325            )
326            .with_source("replay")
327            .with_context(serde_json::json!({
328                "replay_strict": true,
329                "trace_file": trace_path,
330                "missing": missing,
331                "test_id": tc.id
332            }))
333            .with_fix_step("Run `assay trace precompute-embeddings ...`")
334            .with_fix_step("Run `assay trace precompute-judge ...`"),
335        );
336    }
337}
338
339fn check_embedding_dims(
340    resp: &crate::model::LlmResponse,
341    diags: &mut Vec<Diagnostic>,
342    trace_path: Option<&Path>,
343) {
344    // Basic heuristic: if we have embeddings, check simple consistency?
345    // Or if we know expected model?
346    // For now, looking for obvious bad data (empty vectors)
347    // Or strict mismatch if we ever passed an embedder config (not available here yet).
348
349    if let Some(embeddings) = resp
350        .meta
351        .pointer("/assay/embeddings")
352        .and_then(|v| v.as_object())
353    {
354        if let Some(response_vec) = embeddings.get("response").and_then(|v| v.as_array()) {
355            if response_vec.is_empty() {
356                diags.push(
357                    Diagnostic::new(codes::E_EMB_DIMS, "Empty embedding vector found in trace")
358                        .with_source("trace")
359                        .with_context(serde_json::json!({ "trace_file": trace_path }))
360                        .with_fix_step("Regenerate embeddings with precompute-embeddings"),
361                );
362            }
363        }
364    }
365}
assay_core/validate/mod.rs

assay_core/validate/
mod.rs