// async_openai/types/evals/eval.rs

1use derive_builder::Builder;
2use serde::{Deserialize, Serialize};
3
4use crate::error::OpenAIError;
5use crate::types::chat::{ChatCompletionTool, ImageDetail, InputAudio, ResponseFormat};
6use crate::types::graders::{
7    GraderLabelModel, GraderPython, GraderScoreModel, GraderStringCheck, GraderTextSimilarity,
8};
9use crate::types::responses::{ResponseTextParam, Tool};
10use crate::types::Metadata;
11
12// Re-export commonly used types
13pub use crate::types::responses::{EasyInputMessage, InputTextContent, ReasoningEffort};
14
/// An Eval object with a data source config and testing criteria.
/// An Eval represents a task to be done for your LLM integration.
/// Like:
/// - Improve the quality of my chatbot
/// - See how well my chatbot handles customer support
/// - Check if o4-mini is better at my usecase than gpt-4o
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct Eval {
    /// The object type, which is always "eval".
    pub object: String,
    /// Unique identifier for the evaluation.
    pub id: String,
    /// The name of the evaluation.
    pub name: String,
    /// Configuration of data sources used in runs of the evaluation.
    pub data_source_config: EvalDataSourceConfig,
    /// A list of testing criteria.
    pub testing_criteria: Vec<EvalTestingCriterion>,
    /// The Unix timestamp (in seconds) for when the eval was created.
    pub created_at: u64,
    /// Metadata attached to the eval.
    pub metadata: Metadata,
}
37
38/// Configuration of data sources used in runs of the evaluation.
39#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
40#[serde(tag = "type", rename_all = "snake_case")]
41pub enum EvalDataSourceConfig {
42    /// Custom data source config.
43    Custom(EvalCustomDataSourceConfig),
44    /// Logs data source config.
45    Logs(EvalLogsDataSourceConfig),
46    /// Stored completions data source config (deprecated).
47    #[serde(rename = "stored_completions")]
48    StoredCompletions(EvalStoredCompletionsDataSourceConfig),
49}
50
/// Custom data source config.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalCustomDataSourceConfig {
    /// The type of data source. Always "custom".
    // `type` is a Rust keyword, hence the raw identifier plus explicit rename.
    #[serde(rename = "type")]
    pub r#type: String,
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}
60
/// Logs data source config.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalLogsDataSourceConfig {
    /// The type of data source. Always "logs".
    #[serde(rename = "type")]
    pub r#type: String,
    /// Metadata filters for the logs data source.
    /// Omitted from the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}
73
/// Stored completions data source config (deprecated).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalStoredCompletionsDataSourceConfig {
    /// The type of data source. Always "stored_completions".
    #[serde(rename = "type")]
    pub r#type: String,
    /// Metadata filters for the stored completions data source.
    /// Omitted from the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}
86
/// A testing criterion (grader) applied to each item of an eval.
///
/// Serialized with an internal `"type"` tag (snake_case variant name).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalTestingCriterion {
    /// Label model grader.
    LabelModel(EvalGraderLabelModel),
    /// String check grader.
    StringCheck(EvalGraderStringCheck),
    /// Text similarity grader.
    TextSimilarity(EvalGraderTextSimilarity),
    /// Python grader.
    Python(EvalGraderPython),
    /// Score model grader.
    ScoreModel(EvalGraderScoreModel),
}
102
/// Label model grader.
// `transparent`: (de)serializes exactly like the wrapped `GraderLabelModel`.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(transparent)]
pub struct EvalGraderLabelModel(pub GraderLabelModel);
107
/// String check grader.
// `transparent`: (de)serializes exactly like the wrapped `GraderStringCheck`.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(transparent)]
pub struct EvalGraderStringCheck(pub GraderStringCheck);
112
/// Text similarity grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalGraderTextSimilarity {
    /// The underlying text similarity grader; its fields are flattened into
    /// this object when (de)serialized.
    #[serde(flatten)]
    pub grader: GraderTextSimilarity,
    /// The threshold for the score.
    pub pass_threshold: f64,
}
120
/// Text similarity metric.
///
/// Variants serialize as their snake_case names, e.g. "fuzzy_match", "rouge_l".
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum TextSimilarityMetric {
    /// Cosine similarity.
    Cosine,
    /// Fuzzy match.
    FuzzyMatch,
    /// BLEU score.
    Bleu,
    /// GLEU score.
    Gleu,
    /// METEOR score.
    Meteor,
    /// ROUGE-1.
    Rouge1,
    /// ROUGE-2.
    Rouge2,
    /// ROUGE-3.
    Rouge3,
    /// ROUGE-4.
    Rouge4,
    /// ROUGE-5.
    Rouge5,
    /// ROUGE-L.
    RougeL,
}
148
149/// Python grader.
150/// also in openapi spec: GraderPython
151#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
152pub struct EvalGraderPython {
153    #[serde(flatten)]
154    pub grader: GraderPython,
155    pub pass_threshold: Option<f64>,
156}
157
/// Sampling parameters for a grader model.
// NOTE(review): closely mirrors `EvalSamplingParams` below — confirm whether
// both are required by the OpenAPI spec or one could alias the other.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct SamplingParams {
    /// A seed value to initialize the randomness, during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens the grader model may generate in its response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<i32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
}
176
177/// Score model grader.
178/// also in openapi spec: GraderScoreModel
179#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
180pub struct EvalGraderScoreModel {
181    #[serde(flatten)]
182    pub grader: GraderScoreModel,
183    /// The threshold for the score.
184    pub pass_threshold: Option<f64>,
185}
186
/// A message input to the model, pairing a role with content.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalItem {
    /// The role of the message input. One of `user`, `assistant`, `system`, or
    /// `developer`.
    pub role: EvalItemRole,
    /// Inputs to the model - can contain template strings.
    pub content: EvalItemContent,
}
195
/// The role of the message input.
///
/// Variants serialize as their lowercase names ("user", "assistant", …).
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum EvalItemRole {
    /// User role.
    User,
    /// Assistant role.
    Assistant,
    /// System role.
    System,
    /// Developer role.
    Developer,
}
209
/// An output text from the model.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct OutputText {
    /// The text output from the model.
    pub text: String,
}
215
/// An image input to the model.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct InputImage {
    /// The URL of the image input.
    pub image_url: String,
    /// The detail level of the image to be sent to the model. One of `high`, `low`, or `auto`.
    /// Defaults to `auto`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub detail: Option<ImageDetail>,
}
225
/// Inputs to the model - can contain template strings.
///
/// Serialized with an internal `"type"` tag, except `Text`, which is untagged
/// and (de)serializes as a bare string.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalItemContent {
    /// An input text content object.
    InputText(InputTextContent),
    /// An output text from the model.
    OutputText(OutputText),
    /// An image input to the model.
    InputImage(InputImage),
    /// An audio input to the model.
    InputAudio(InputAudio),
    /// An array of Input text, Input image, and Input audio
    Array(Vec<EvalItemContent>),
    #[serde(untagged)]
    /// A text input to the model.
    Text(String),
}
244
/// A paginated list of evals.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval objects.
    pub data: Vec<Eval>,
    /// The identifier of the first eval in the data array.
    pub first_id: String,
    /// The identifier of the last eval in the data array.
    pub last_id: String,
    /// Indicates whether there are more evals available.
    pub has_more: bool,
}
259
260#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
261#[builder(name = "CreateEvalRequestArgs")]
262#[builder(pattern = "mutable")]
263#[builder(setter(into, strip_option), default)]
264#[builder(derive(Debug))]
265#[builder(build_fn(error = "OpenAIError"))]
266pub struct CreateEvalRequest {
267    /// The name of the evaluation.
268    pub name: Option<String>,
269    ///The configuration for the data source used for the evaluation runs.
270    /// Dictates the schema of the data used in the evaluation.
271    pub data_source_config: CreateEvalDataSourceConfig,
272    /// A list of graders for all eval runs in this group. Graders can reference variables in the data
273    /// source using double curly braces notation, like `{{item.variable_name}}`. To reference the model's
274    /// output, use the `sample` namespace (ie, `{{sample.output_text}}`).
275    pub testing_criteria: Vec<CreateEvalTestingCriterion>,
276    #[serde(skip_serializing_if = "Option::is_none")]
277    pub metadata: Option<Metadata>,
278}
279
/// Data source config for creating an eval.
///
/// Serialized with an internal `"type"` tag (snake_case variant name).
#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalDataSourceConfig {
    /// A CustomDataSourceConfig object that defines the schema for the data source used for the evaluation
    /// runs. This schema is used to define the shape of the data that will be:
    /// - Used to define your testing criteria and
    /// - What data is required when creating a run
    Custom(CreateEvalCustomDataSourceConfig),
    /// A data source config which specifies the metadata property of your logs query.
    /// This is usually metadata like `usecase=chatbot` or `prompt-version=v2`, etc.
    Logs(CreateEvalLogsDataSourceConfig),
}
292
293impl Default for CreateEvalDataSourceConfig {
294    fn default() -> Self {
295        Self::Custom(CreateEvalCustomDataSourceConfig::default())
296    }
297}
298
/// Custom data source config for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalCustomDataSourceConfigArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalCustomDataSourceConfig {
    /// The json schema for each row in the data source.
    pub item_schema: serde_json::Value,
    /// Whether the eval should expect you to populate the sample namespace (ie, by generating responses
    /// off of your data source).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_sample_schema: Option<bool>,
}
313
/// Logs data source config for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalLogsDataSourceConfigArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalLogsDataSourceConfig {
    /// Metadata filters for the logs data source.
    /// Omitted from the request body when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}
326
/// A testing criterion (grader) supplied when creating an eval.
///
/// Serialized with an internal `"type"` tag (snake_case variant name).
#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalTestingCriterion {
    /// A LabelModelGrader object which uses a model to assign labels to each item
    /// in the evaluation.
    LabelModel(CreateEvalLabelModelGrader),
    /// A StringCheckGrader object that performs a string comparison between input and reference using a
    /// specified operation.
    StringCheck(EvalGraderStringCheck),
    /// Text similarity grader.
    TextSimilarity(EvalGraderTextSimilarity),
    /// Python grader.
    Python(EvalGraderPython),
    /// Score model grader.
    ScoreModel(EvalGraderScoreModel),
}
343
/// Label model grader for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalLabelModelGraderArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalLabelModelGrader {
    /// The name of the grader.
    pub name: String,
    /// The model to use for the evaluation. Must support structured outputs.
    pub model: String,
    /// A list of chat messages forming the prompt or context. May include variable references to the
    /// `item` namespace, ie `{{item.name}}`.
    pub input: Vec<CreateEvalItem>,
    /// The labels to classify to each item in the evaluation.
    pub labels: Vec<String>,
    /// The labels that indicate a passing result. Must be a subset of labels.
    pub passing_labels: Vec<String>,
}
364
/// A simple chat message with free-form role and content strings.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub struct SimpleInputMessage {
    /// The role of the message.
    pub role: String,
    /// The content of the message.
    pub content: String,
}
372
/// A chat message that makes up the prompt or context.
///
/// `Message` is tagged as `"type": "message"`; `Simple` is untagged and acts
/// as the fallback shape during deserialization.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalItem {
    /// A message input to the model with a role indicating instruction following
    /// hierarchy. Instructions given with the `developer` or `system` role take
    /// precedence over instructions given with the `user` role. Messages with the
    /// `assistant` role are presumed to have been generated by the model in previous
    /// interactions.
    Message(EvalItem),

    /// SimpleInputMessage
    #[serde(untagged)]
    Simple(SimpleInputMessage),
}
388
/// Request to update an eval.
#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
#[builder(name = "UpdateEvalRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct UpdateEvalRequest {
    /// Rename the evaluation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// Metadata attached to the eval.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}
404
/// Response from deleting an eval.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct DeleteEvalResponse {
    /// The object type, which is always "eval.deleted".
    pub object: String,
    /// Whether the eval was deleted.
    pub deleted: bool,
    /// The ID of the deleted eval.
    pub eval_id: String,
}
415
// EvalRun types

/// A schema representing an evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRun {
    /// The object type, which is always "eval.run".
    pub object: String,
    /// Unique identifier for the evaluation run.
    pub id: String,
    /// The identifier of the associated evaluation.
    pub eval_id: String,
    /// The status of the evaluation run.
    pub status: EvalRunStatus,
    /// The model that is evaluated, if applicable.
    pub model: String,
    /// The name of the evaluation run.
    pub name: String,
    /// Unix timestamp (in seconds) when the evaluation run was created.
    pub created_at: u64,
    /// The URL to the rendered evaluation run report on the UI dashboard.
    pub report_url: String,
    /// Counters summarizing the outcomes of the evaluation run.
    pub result_counts: EvalRunResultCounts,
    /// Usage statistics for each model during the evaluation run.
    /// May be `null` in the API response.
    pub per_model_usage: Option<Vec<EvalRunModelUsage>>,
    /// Results per testing criteria applied during the evaluation run.
    /// May be `null` in the API response.
    pub per_testing_criteria_results: Option<Vec<EvalRunTestingCriteriaResult>>,
    /// Information about the run's data source.
    pub data_source: EvalRunDataSource,
    /// Metadata attached to the run.
    pub metadata: Metadata,
    /// Error information, if any.
    pub error: Option<EvalApiError>,
}
450
/// Status of an evaluation run.
///
/// Variants serialize as their snake_case names ("queued", "in_progress", …).
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum EvalRunStatus {
    /// Queued.
    Queued,
    /// In progress.
    InProgress,
    /// Completed.
    Completed,
    /// Failed.
    Failed,
    /// Canceled.
    Canceled,
}
466
/// Counters summarizing the outcomes of the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunResultCounts {
    /// Total number of executed output items.
    pub total: u32,
    /// Number of output items that resulted in an error.
    pub errored: u32,
    /// Number of output items that failed to pass the evaluation.
    pub failed: u32,
    /// Number of output items that passed the evaluation.
    pub passed: u32,
}
479
/// Usage statistics for each model during the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunModelUsage {
    /// The name of the model.
    pub model_name: String,
    /// The number of invocations.
    pub invocation_count: u32,
    /// The number of prompt tokens used.
    pub prompt_tokens: u32,
    /// The number of completion tokens generated.
    pub completion_tokens: u32,
    /// The total number of tokens used.
    pub total_tokens: u32,
    /// The number of tokens retrieved from cache.
    pub cached_tokens: u32,
}
496
/// Results per testing criteria applied during the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunTestingCriteriaResult {
    /// A description of the testing criteria.
    pub testing_criteria: String,
    /// Number of tests passed for this criteria.
    pub passed: u32,
    /// Number of tests failed for this criteria.
    pub failed: u32,
}
507
/// Information about the run's data source.
///
/// Serialized with an internal `"type"` tag (snake_case variant name).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalRunDataSource {
    /// A JsonlRunDataSource object with that specifies a JSONL file that matches the eval
    Jsonl(CreateEvalJsonlRunDataSource),
    /// A CompletionsRunDataSource object describing a model sampling configuration.
    Completions(CreateEvalCompletionsRunDataSource),
    /// A ResponsesRunDataSource object describing a model sampling configuration.
    Responses(CreateEvalResponsesRunDataSource),
}
519
/// JSONL run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalJsonlRunDataSource {
    /// Determines what populates the `item` namespace in the data source.
    pub source: EvalJsonlSource,
}
526
/// JSONL source.
///
/// Serialized with an internal `"type"` tag (snake_case variant name).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalJsonlSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
}
536
/// JSONL file content source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlFileContentSource {
    /// The content of the jsonl file.
    pub content: Vec<EvalJsonlContentItem>,
}
543
/// JSONL file ID source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlFileIdSource {
    /// The identifier of the file.
    pub id: String,
}
550
/// JSONL content item.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlContentItem {
    /// The item data.
    pub item: serde_json::Value,
    /// The sample data, if any. Omitted from the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample: Option<serde_json::Value>,
}
560
/// Completions run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalCompletionsRunDataSource {
    /// Used when sampling from a model. Dictates the structure of the messages passed into the model. Can
    /// either be a reference to a prebuilt trajectory (ie, `item.input_trajectory`), or a template with
    /// variable references to the `item` namespace.
    pub input_messages: EvalInputMessages,
    /// The sampling parameters for the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_params: Option<EvalSamplingParams>,
    /// The name of the model to use for generating completions (e.g. "o3-mini").
    pub model: String,
    /// Determines what populates the `item` namespace in this run's data source.
    pub source: EvalCompletionsSource,
}
576
/// Template-based input messages for a run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct TemplateInputMessages {
    /// A list of chat messages forming the prompt or context. May include variable references to
    /// the `item` namespace, ie {{item.name}}.
    pub template: Vec<CreateEvalItem>,
}
583
/// Input messages given as a reference into the `item` namespace.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct ItemReference {
    /// A reference to a variable in the `item` namespace. Ie, "item.input_trajectory"
    pub item_reference: String,
}
589
/// Input messages for completions.
///
/// Serialized with an internal `"type"` tag (snake_case variant name).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalInputMessages {
    /// Template input messages.
    Template(TemplateInputMessages),
    /// Item reference input messages.
    ItemReference(ItemReference),
}
599
/// Sampling parameters for the model (chat completions run data source).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Default)]
pub struct EvalSamplingParams {
    /// A seed value to initialize the randomness, during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens in the generated output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<i32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// An object specifying the format that the model must output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<ResponseFormat>,
    /// A list of tools the model may call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<ChatCompletionTool>>,
}
625
/// Sampling parameters for the model (Responses run data source).
// Differs from `EvalSamplingParams` in tool type (`responses::Tool`), the
// `text` field, and `max_completion_tokens` being `u32`.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Default)]
pub struct EvalResponsesSamplingParams {
    /// A seed value to initialize the randomness, during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens in the generated output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<u32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// An object specifying the format that the model must output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<ResponseFormat>,
    /// A list of tools the model may call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<Tool>>,
    /// Configuration options for a text response from the model. Can be plain
    /// text or structured JSON data. Learn more:
    /// - [Text inputs and outputs](https://platform.openai.com/docs/guides/text)
    /// - [Structured Outputs](https://platform.openai.com/docs/guides/structured-outputs)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text: Option<ResponseTextParam>,
}
656
/// Completions source.
///
/// Serialized with an internal `"type"` tag (snake_case variant name).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalCompletionsSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
    /// Stored completions source.
    StoredCompletions(EvalStoredCompletionsSource),
}
668
/// Stored completions source. All fields are optional query filters.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalStoredCompletionsSource {
    /// Metadata filters for the stored completions.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// An optional model to filter by.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// An optional Unix timestamp to filter items created after this time.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_after: Option<i64>,
    /// An optional Unix timestamp to filter items created before this time.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_before: Option<i64>,
    /// An optional maximum number of items to return.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub limit: Option<i32>,
}
688
/// Responses run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalResponsesRunDataSource {
    /// Used when sampling from a model. Dictates the structure of the messages passed into the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_messages: Option<EvalInputMessages>,
    /// The sampling parameters for the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_params: Option<EvalResponsesSamplingParams>,
    /// The name of the model to sample from, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Determines what populates the `item` namespace in this run's data source.
    pub source: EvalResponsesRunSource,
}
703
/// Responses source.
///
/// Serialized with an internal `"type"` tag (snake_case variant name).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalResponsesRunSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
    /// A EvalResponsesSource object describing a run data source configuration.
    Responses(EvalResponsesSource),
}
715
/// A EvalResponsesSource object describing a run data source configuration.
/// Every field is an optional query parameter used to select responses.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalResponsesSource {
    /// Metadata filter for the responses. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,
    /// The name of the model to find responses for. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Optional string to search the 'instructions' field. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions_search: Option<String>,
    /// Only include items created after this timestamp (inclusive). This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_after: Option<u64>,
    /// Only include items created before this timestamp (inclusive). This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_before: Option<u64>,
    /// Optional reasoning effort parameter. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// Sampling temperature. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// Nucleus sampling parameter. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// List of user identifiers. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub users: Option<Vec<String>>,
    /// List of tool names. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<String>>,
}
750
/// A paginated list of eval runs.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval run objects.
    pub data: Vec<EvalRun>,
    /// The identifier of the first eval run in the data array.
    pub first_id: String,
    /// The identifier of the last eval run in the data array.
    pub last_id: String,
    /// Indicates whether there are more eval runs available.
    pub has_more: bool,
}
765
/// Request to create an eval run.
#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
#[builder(name = "CreateEvalRunRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalRunRequest {
    /// The name of the run.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// Details about the run's data source.
    pub data_source: CreateEvalRunDataSource,
    /// Metadata attached to the run.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}
783
/// Details about the run's data source.
///
/// Serialized with an adjacent `"type"` tag in `snake_case`
/// (`"jsonl"`, `"completions"`, or `"responses"`), with the variant's
/// fields flattened alongside it.
#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalRunDataSource {
    /// JSONL data source.
    Jsonl(CreateEvalJsonlRunDataSource),
    /// Completions data source.
    Completions(CreateEvalCompletionsRunDataSource),
    /// Responses data source.
    Responses(CreateEvalResponsesRunDataSource),
}
795
796// Manual Default implementation for Builder compatibility
797impl Default for CreateEvalRunDataSource {
798    fn default() -> Self {
799        todo!()
800    }
801}
802
/// Response returned after deleting an eval run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct DeleteEvalRunResponse {
    /// The object type, which is always "eval.run.deleted".
    pub object: String,
    /// Whether the eval run was deleted.
    pub deleted: bool,
    /// The ID of the deleted eval run.
    pub run_id: String,
}
813
814// EvalRunOutputItem types
815
/// A schema representing an evaluation run output item: one row of the data
/// source, the sample generated for it, and the per-grader results.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItem {
    /// The object type, which is always "eval.run.output_item".
    pub object: String,
    /// Unique identifier for the evaluation run output item.
    pub id: String,
    /// The identifier of the evaluation run associated with this output item.
    pub run_id: String,
    /// The identifier of the evaluation group.
    pub eval_id: String,
    /// Unix timestamp (in seconds) when the evaluation run was created.
    pub created_at: i64,
    /// The status of the evaluation run.
    // NOTE(review): the set of possible status strings is not visible here —
    // kept as a free-form String rather than an enum; confirm against the API.
    pub status: String,
    /// The identifier for the data source item.
    pub datasource_item_id: u64,
    /// Details of the input data source item. Schema varies per data source,
    /// hence the untyped `serde_json::Value`.
    pub datasource_item: serde_json::Value,
    /// A list of grader results for this output item.
    pub results: Vec<EvalRunOutputItemResult>,
    /// A sample containing the input and output of the evaluation run.
    pub sample: EvalRunOutputItemSample,
}
840
/// A single grader result for an evaluation run output item.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemResult {
    /// The name of the grader.
    pub name: String,
    /// The numeric score produced by the grader.
    pub score: f64,
    /// Whether the grader considered the output a pass.
    pub passed: bool,
    /// Optional sample or intermediate data produced by the grader.
    /// Untyped because its shape depends on the grader; omitted from
    /// serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample: Option<serde_json::Value>,
}
854
/// A minimal role/content message pair, used for the output messages of an
/// evaluation run sample (see `EvalRunOutputItemSample::output`).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct SimpleOutputMessage {
    /// The role of the message author.
    pub role: String,
    /// The plain-text content of the message.
    pub content: String,
}
860
/// A sample containing the input and output of the evaluation run,
/// along with the sampling parameters and token usage that produced it.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemSample {
    /// An array of input messages.
    pub input: Vec<SimpleInputMessage>,
    /// An array of output messages.
    pub output: Vec<SimpleOutputMessage>,
    /// The reason why the sample generation was finished.
    pub finish_reason: String,
    /// The model used for generating the sample.
    pub model: String,
    /// Token usage details for the sample.
    pub usage: EvalRunOutputItemUsage,
    /// Error information, if any.
    // NOTE(review): unlike other optional fields in this file, this has no
    // `skip_serializing_if`, so it serializes as `null` when `None` — confirm
    // this is intentional for API round-tripping.
    pub error: Option<EvalApiError>,
    /// The sampling temperature used.
    pub temperature: f64,
    /// The maximum number of tokens allowed for completion.
    pub max_completion_tokens: i32,
    /// The top_p value used for sampling.
    pub top_p: f64,
    /// The seed used for generating the sample.
    pub seed: i32,
}
885
/// Token usage details for a single evaluation run sample.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemUsage {
    /// The total number of tokens used.
    pub total_tokens: i32,
    /// The number of completion tokens generated.
    pub completion_tokens: i32,
    /// The number of prompt tokens used.
    pub prompt_tokens: i32,
    /// The number of tokens retrieved from cache.
    pub cached_tokens: i32,
}
898
/// A paginated list of eval run output items.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval run output item objects.
    pub data: Vec<EvalRunOutputItem>,
    /// The identifier of the first eval run output item in the data array.
    pub first_id: String,
    /// The identifier of the last eval run output item in the data array.
    pub last_id: String,
    /// Indicates whether there are more eval run output items available
    /// beyond this page.
    pub has_more: bool,
}
913
914/// An object representing an error response from the Eval API.
915#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
916pub struct EvalApiError {
917    /// The error code.
918    pub code: String,
919    /// The error message.
920    pub message: String,
921}