async_openai/types/evals/eval.rs

use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use crate::error::OpenAIError;
use crate::types::graders::{
    GraderLabelModel, GraderPython, GraderScoreModel, GraderStringCheck, GraderTextSimilarity,
};
use crate::types::responses::{ResponseTextParam, Tool};
use crate::types::{ChatCompletionTool, ImageDetail, InputAudio, Metadata, ResponseFormat};

// Re-export commonly used types.
pub use crate::types::responses::{EasyInputMessage, InputTextContent, ReasoningEffort};
/// An Eval object with a data source config and testing criteria.
/// An Eval represents a task to be done for your LLM integration. For example:
/// - Improve the quality of my chatbot
/// - See how well my chatbot handles customer support
/// - Check if o4-mini is better at my use case than gpt-4o
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct Eval {
    /// The object type, which is always "eval".
    pub object: String,
    /// Unique identifier for the evaluation.
    pub id: String,
    /// The name of the evaluation.
    pub name: String,
    /// Configuration of data sources used in runs of the evaluation.
    pub data_source_config: EvalDataSourceConfig,
    /// A list of testing criteria.
    pub testing_criteria: Vec<EvalTestingCriterion>,
    /// The Unix timestamp (in seconds) for when the eval was created.
    pub created_at: u64,
    /// Metadata attached to the eval.
    pub metadata: Metadata,
}

/// Configuration of data sources used in runs of the evaluation.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalDataSourceConfig {
    /// Custom data source config.
    Custom(EvalCustomDataSourceConfig),
    /// Logs data source config.
    Logs(EvalLogsDataSourceConfig),
    /// Stored completions data source config (deprecated).
    StoredCompletions(EvalStoredCompletionsDataSourceConfig),
}

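// A hedged sketch (not from the OpenAI docs) of how the internally tagged
// `EvalDataSourceConfig` maps to JSON: the `type` key selects the variant and
// the remaining keys populate it. The schema literal is illustrative.
#[cfg(test)]
mod data_source_config_json_example {
    use super::*;

    #[test]
    fn custom_variant_is_selected_by_type_tag() {
        let json = r#"{"type":"custom","schema":{"type":"object","properties":{}}}"#;
        let config: EvalDataSourceConfig = serde_json::from_str(json).unwrap();
        assert!(matches!(config, EvalDataSourceConfig::Custom(_)));
    }
}
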
/// Custom data source config. The `type` tag ("custom") is carried by
/// `EvalDataSourceConfig`, so it is not repeated here.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalCustomDataSourceConfig {
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}

/// Logs data source config. The `type` tag ("logs") is carried by
/// `EvalDataSourceConfig`, so it is not repeated here.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalLogsDataSourceConfig {
    /// Metadata filters for the logs data source.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}

/// Stored completions data source config (deprecated). The `type` tag
/// ("stored_completions") is carried by `EvalDataSourceConfig`, so it is not
/// repeated here.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalStoredCompletionsDataSourceConfig {
    /// Metadata filters for the stored completions data source.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}

/// A testing criterion for an eval.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalTestingCriterion {
    /// Label model grader.
    LabelModel(EvalGraderLabelModel),
    /// String check grader.
    StringCheck(EvalGraderStringCheck),
    /// Text similarity grader.
    TextSimilarity(EvalGraderTextSimilarity),
    /// Python grader.
    Python(EvalGraderPython),
    /// Score model grader.
    ScoreModel(EvalGraderScoreModel),
}

/// Label model grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(transparent)]
pub struct EvalGraderLabelModel(pub GraderLabelModel);

/// String check grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(transparent)]
pub struct EvalGraderStringCheck(pub GraderStringCheck);

/// Text similarity grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalGraderTextSimilarity {
    #[serde(flatten)]
    pub grader: GraderTextSimilarity,
    /// The threshold for the score.
    pub pass_threshold: f64,
}

/// Text similarity metric.
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum TextSimilarityMetric {
    /// Cosine similarity.
    Cosine,
    /// Fuzzy match.
    FuzzyMatch,
    /// BLEU score.
    Bleu,
    /// GLEU score.
    Gleu,
    /// METEOR score.
    Meteor,
    /// ROUGE-1.
    #[serde(rename = "rouge_1")]
    Rouge1,
    /// ROUGE-2.
    #[serde(rename = "rouge_2")]
    Rouge2,
    /// ROUGE-3.
    #[serde(rename = "rouge_3")]
    Rouge3,
    /// ROUGE-4.
    #[serde(rename = "rouge_4")]
    Rouge4,
    /// ROUGE-5.
    #[serde(rename = "rouge_5")]
    Rouge5,
    /// ROUGE-L.
    RougeL,
}

/// Python grader.
/// Corresponds to `GraderPython` in the OpenAPI spec, plus an optional pass threshold.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalGraderPython {
    #[serde(flatten)]
    pub grader: GraderPython,
    /// The threshold for the score.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub pass_threshold: Option<f64>,
}

/// Sampling parameters for a model-based grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct SamplingParams {
    /// A seed value to initialize the randomness during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens the grader model may generate in its response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<i32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
}

/// Score model grader.
/// Corresponds to `GraderScoreModel` in the OpenAPI spec, plus an optional pass threshold.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalGraderScoreModel {
    #[serde(flatten)]
    pub grader: GraderScoreModel,
    /// The threshold for the score.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub pass_threshold: Option<f64>,
}

/// A message input to the model, with a role indicating instruction-following
/// hierarchy.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalItem {
    /// The role of the message input. One of `user`, `assistant`, `system`, or
    /// `developer`.
    pub role: EvalItemRole,
    /// Inputs to the model - can contain template strings.
    pub content: EvalItemContent,
}

/// The role of the message input.
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum EvalItemRole {
    /// User role.
    User,
    /// Assistant role.
    Assistant,
    /// System role.
    System,
    /// Developer role.
    Developer,
}

/// An output text from the model.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct OutputText {
    /// The text output from the model.
    pub text: String,
}

/// An image input to the model.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct InputImage {
    /// The URL of the image input.
    pub image_url: String,
    /// The detail level of the image to be sent to the model. One of `high`, `low`, or `auto`.
    /// Defaults to `auto`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub detail: Option<ImageDetail>,
}

/// Inputs to the model - can contain template strings.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalItemContent {
    /// An input text content object.
    InputText(InputTextContent),
    /// An output text from the model.
    OutputText(OutputText),
    /// An image input to the model.
    InputImage(InputImage),
    /// An audio input to the model.
    InputAudio(InputAudio),
    /// An array of input text, input image, and input audio content.
    /// Untagged: the wire format is a bare JSON array, and internally tagged
    /// enums cannot carry sequence content.
    #[serde(untagged)]
    Array(Vec<EvalItemContent>),
    /// A text input to the model.
    #[serde(untagged)]
    Text(String),
}

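// A hedged sketch of `EvalItemContent`'s mixed tagging: plain JSON strings map
// to the untagged `Text` variant, while objects are dispatched on their `type`
// key. Values are illustrative.
#[cfg(test)]
mod eval_item_content_json_example {
    use super::*;

    #[test]
    fn plain_string_is_text() {
        let content: EvalItemContent = serde_json::from_str(r#""hello""#).unwrap();
        assert_eq!(content, EvalItemContent::Text("hello".to_string()));
    }

    #[test]
    fn tagged_object_is_output_text() {
        let content: EvalItemContent =
            serde_json::from_str(r#"{"type":"output_text","text":"hi"}"#).unwrap();
        assert_eq!(
            content,
            EvalItemContent::OutputText(OutputText {
                text: "hi".to_string()
            })
        );
    }
}
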
/// List of evals.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval objects.
    pub data: Vec<Eval>,
    /// The identifier of the first eval in the data array.
    pub first_id: String,
    /// The identifier of the last eval in the data array.
    pub last_id: String,
    /// Indicates whether there are more evals available.
    pub has_more: bool,
}

/// Request to create an eval.
#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
#[builder(name = "CreateEvalRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalRequest {
    /// The name of the evaluation.
    pub name: Option<String>,
    /// The configuration for the data source used for the evaluation runs.
    /// Dictates the schema of the data used in the evaluation.
    pub data_source_config: CreateEvalDataSourceConfig,
    /// A list of graders for all eval runs in this group. Graders can reference variables in the data
    /// source using double curly braces notation, like `{{item.variable_name}}`. To reference the model's
    /// output, use the `sample` namespace (i.e., `{{sample.output_text}}`).
    pub testing_criteria: Vec<CreateEvalTestingCriterion>,
    /// Metadata attached to the eval.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}

/// The data source config for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalDataSourceConfig {
    /// A CustomDataSourceConfig object that defines the schema for the data source used for the
    /// evaluation runs. This schema defines the shape of the data that will be:
    /// - Used to define your testing criteria, and
    /// - Required when creating a run.
    Custom(CreateEvalCustomDataSourceConfig),
    /// A data source config which specifies the metadata property of your logs query.
    /// This is usually metadata like `usecase=chatbot` or `prompt-version=v2`, etc.
    Logs(CreateEvalLogsDataSourceConfig),
}

impl Default for CreateEvalDataSourceConfig {
    fn default() -> Self {
        Self::Custom(CreateEvalCustomDataSourceConfig::default())
    }
}

/// Custom data source config for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalCustomDataSourceConfigArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalCustomDataSourceConfig {
    /// The json schema for each row in the data source.
    pub item_schema: serde_json::Value,
    /// Whether the eval should expect you to populate the sample namespace (i.e., by generating
    /// responses off of your data source).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_sample_schema: Option<bool>,
}

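// A hedged sketch of assembling a `CreateEvalRequest` with the generated
// builder. The schema is illustrative, and the empty criteria list is a
// placeholder; a real request would include at least one grader.
#[cfg(test)]
mod create_eval_request_example {
    use super::*;

    #[test]
    fn builds_with_custom_data_source() -> Result<(), OpenAIError> {
        let data_source_config =
            CreateEvalDataSourceConfig::Custom(CreateEvalCustomDataSourceConfig {
                item_schema: serde_json::json!({
                    "type": "object",
                    "properties": { "question": { "type": "string" } },
                    "required": ["question"]
                }),
                include_sample_schema: Some(true),
            });
        let request = CreateEvalRequestArgs::default()
            .name("example-eval")
            .data_source_config(data_source_config)
            .testing_criteria(Vec::<CreateEvalTestingCriterion>::new())
            .build()?;
        assert_eq!(request.name.as_deref(), Some("example-eval"));
        Ok(())
    }
}
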
/// Logs data source config for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalLogsDataSourceConfigArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalLogsDataSourceConfig {
    /// Metadata filters for the logs data source.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}

/// A testing criterion for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalTestingCriterion {
    /// A LabelModelGrader object which uses a model to assign labels to each item
    /// in the evaluation.
    LabelModel(CreateEvalLabelModelGrader),
    /// A StringCheckGrader object that performs a string comparison between input and reference using a
    /// specified operation.
    StringCheck(EvalGraderStringCheck),
    /// Text similarity grader.
    TextSimilarity(EvalGraderTextSimilarity),
    /// Python grader.
    Python(EvalGraderPython),
    /// Score model grader.
    ScoreModel(EvalGraderScoreModel),
}

/// Label model grader for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalLabelModelGraderArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalLabelModelGrader {
    /// The name of the grader.
    pub name: String,
    /// The model to use for the evaluation. Must support structured outputs.
    pub model: String,
    /// A list of chat messages forming the prompt or context. May include variable references to the
    /// `item` namespace, e.g. `{{item.name}}`.
    pub input: Vec<CreateEvalItem>,
    /// The labels to assign to each item in the evaluation.
    pub labels: Vec<String>,
    /// The labels that indicate a passing result. Must be a subset of `labels`.
    pub passing_labels: Vec<String>,
}

/// A simple message with a plain string role and content.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub struct SimpleInputMessage {
    /// The role of the message.
    pub role: String,
    /// The content of the message.
    pub content: String,
}

/// A chat message that makes up the prompt or context.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalItem {
    /// A message input to the model with a role indicating instruction following
    /// hierarchy. Instructions given with the `developer` or `system` role take
    /// precedence over instructions given with the `user` role. Messages with the
    /// `assistant` role are presumed to have been generated by the model in previous
    /// interactions.
    Message(EvalItem),

    /// A simple message with a plain string role and content.
    #[serde(untagged)]
    Simple(SimpleInputMessage),
}

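// A hedged sketch of the two wire shapes of `CreateEvalItem`: the `Message`
// variant carries a `"type": "message"` tag, while the untagged `Simple`
// variant serializes as a bare role/content pair. Values are illustrative.
#[cfg(test)]
mod create_eval_item_json_example {
    use super::*;

    #[test]
    fn message_item_carries_type_tag() {
        let item = CreateEvalItem::Message(EvalItem {
            role: EvalItemRole::System,
            content: EvalItemContent::Text("You are a helpful grader.".to_string()),
        });
        let json = serde_json::to_value(&item).unwrap();
        assert_eq!(json["type"], "message");
        assert_eq!(json["role"], "system");
        assert_eq!(json["content"], "You are a helpful grader.");
    }

    #[test]
    fn simple_message_has_no_type_tag() {
        let item = CreateEvalItem::Simple(SimpleInputMessage {
            role: "user".to_string(),
            content: "Hello!".to_string(),
        });
        let json = serde_json::to_value(&item).unwrap();
        assert_eq!(json, serde_json::json!({ "role": "user", "content": "Hello!" }));
    }
}
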
/// Request to update an eval.
#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
#[builder(name = "UpdateEvalRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct UpdateEvalRequest {
    /// Rename the evaluation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// Metadata attached to the eval.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}

/// Response from deleting an eval.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct DeleteEvalResponse {
    /// The object type, which is always "eval.deleted".
    pub object: String,
    /// Whether the eval was deleted.
    pub deleted: bool,
    /// The ID of the deleted eval.
    pub eval_id: String,
}

// EvalRun types

/// A schema representing an evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRun {
    /// The object type, which is always "eval.run".
    pub object: String,
    /// Unique identifier for the evaluation run.
    pub id: String,
    /// The identifier of the associated evaluation.
    pub eval_id: String,
    /// The status of the evaluation run.
    pub status: EvalRunStatus,
    /// The model that is evaluated, if applicable.
    pub model: String,
    /// The name of the evaluation run.
    pub name: String,
    /// Unix timestamp (in seconds) when the evaluation run was created.
    pub created_at: u64,
    /// The URL to the rendered evaluation run report on the UI dashboard.
    pub report_url: String,
    /// Counters summarizing the outcomes of the evaluation run.
    pub result_counts: EvalRunResultCounts,
    /// Usage statistics for each model during the evaluation run.
    pub per_model_usage: Option<Vec<EvalRunModelUsage>>,
    /// Results per testing criterion applied during the evaluation run.
    pub per_testing_criteria_results: Option<Vec<EvalRunTestingCriteriaResult>>,
    /// Information about the run's data source.
    pub data_source: EvalRunDataSource,
    /// Metadata attached to the run.
    pub metadata: Metadata,
    /// Error information, if any.
    pub error: Option<EvalApiError>,
}

/// Status of an evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum EvalRunStatus {
    /// Queued.
    Queued,
    /// In progress.
    InProgress,
    /// Completed.
    Completed,
    /// Failed.
    Failed,
    /// Canceled.
    Canceled,
}

/// Counters summarizing the outcomes of the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunResultCounts {
    /// Total number of executed output items.
    pub total: u32,
    /// Number of output items that resulted in an error.
    pub errored: u32,
    /// Number of output items that failed to pass the evaluation.
    pub failed: u32,
    /// Number of output items that passed the evaluation.
    pub passed: u32,
}

/// Usage statistics for each model during the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunModelUsage {
    /// The name of the model.
    pub model_name: String,
    /// The number of invocations.
    pub invocation_count: u32,
    /// The number of prompt tokens used.
    pub prompt_tokens: u32,
    /// The number of completion tokens generated.
    pub completion_tokens: u32,
    /// The total number of tokens used.
    pub total_tokens: u32,
    /// The number of tokens retrieved from cache.
    pub cached_tokens: u32,
}

/// Results per testing criterion applied during the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunTestingCriteriaResult {
    /// A description of the testing criterion.
    pub testing_criteria: String,
    /// Number of tests passed for this criterion.
    pub passed: u32,
    /// Number of tests failed for this criterion.
    pub failed: u32,
}

/// Information about the run's data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalRunDataSource {
    /// A JsonlRunDataSource object that specifies a JSONL file matching the eval.
    Jsonl(CreateEvalJsonlRunDataSource),
    /// A CompletionsRunDataSource object describing a model sampling configuration.
    Completions(CreateEvalCompletionsRunDataSource),
    /// A ResponsesRunDataSource object describing a model sampling configuration.
    Responses(CreateEvalResponsesRunDataSource),
}

/// JSONL run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalJsonlRunDataSource {
    /// Determines what populates the `item` namespace in the data source.
    pub source: EvalJsonlSource,
}

/// JSONL source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalJsonlSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
}

/// JSONL file content source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlFileContentSource {
    /// The content of the jsonl file.
    pub content: Vec<EvalJsonlContentItem>,
}

/// JSONL file ID source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlFileIdSource {
    /// The identifier of the file.
    pub id: String,
}

/// JSONL content item.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlContentItem {
    /// The item data.
    pub item: serde_json::Value,
    /// The sample data, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample: Option<serde_json::Value>,
}

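// A hedged sketch of building an in-memory JSONL source and the tagged JSON it
// serializes to. The row data is illustrative.
#[cfg(test)]
mod jsonl_source_json_example {
    use super::*;

    #[test]
    fn file_content_source_is_tagged() {
        let source = EvalJsonlSource::FileContent(EvalJsonlFileContentSource {
            content: vec![EvalJsonlContentItem {
                item: serde_json::json!({ "question": "What is 2 + 2?" }),
                sample: None,
            }],
        });
        let json = serde_json::to_value(&source).unwrap();
        assert_eq!(json["type"], "file_content");
        assert_eq!(json["content"][0]["item"]["question"], "What is 2 + 2?");
    }
}
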
/// Completions run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalCompletionsRunDataSource {
    /// Used when sampling from a model. Dictates the structure of the messages passed into the model. Can
    /// either be a reference to a prebuilt trajectory (i.e., `item.input_trajectory`), or a template with
    /// variable references to the `item` namespace.
    pub input_messages: EvalInputMessages,
    /// The sampling parameters for the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_params: Option<EvalSamplingParams>,
    /// The name of the model to use for generating completions (e.g. "o3-mini").
    pub model: String,
    /// Determines what populates the `item` namespace in this run's data source.
    pub source: EvalCompletionsSource,
}

/// Template input messages.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct TemplateInputMessages {
    /// A list of chat messages forming the prompt or context. May include variable references to
    /// the `item` namespace, e.g. `{{item.name}}`.
    pub template: Vec<CreateEvalItem>,
}

/// A reference to a variable in the `item` namespace.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct ItemReference {
    /// A reference to a variable in the `item` namespace, e.g. "item.input_trajectory".
    pub item_reference: String,
}

/// Input messages for completions.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalInputMessages {
    /// Template input messages.
    Template(TemplateInputMessages),
    /// Item reference input messages.
    ItemReference(ItemReference),
}

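// A hedged sketch of the `EvalInputMessages::Template` wire shape: the enum is
// tagged with `"type": "template"`, and each untagged simple message inside the
// template serializes as a bare role/content pair. The template string is
// illustrative.
#[cfg(test)]
mod input_messages_json_example {
    use super::*;

    #[test]
    fn template_messages_are_tagged() {
        let messages = EvalInputMessages::Template(TemplateInputMessages {
            template: vec![CreateEvalItem::Simple(SimpleInputMessage {
                role: "user".to_string(),
                content: "{{item.question}}".to_string(),
            })],
        });
        let json = serde_json::to_value(&messages).unwrap();
        assert_eq!(json["type"], "template");
        assert_eq!(json["template"][0]["role"], "user");
    }
}
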
/// Sampling parameters for the model.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Default)]
pub struct EvalSamplingParams {
    /// A seed value to initialize the randomness during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens in the generated output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<i32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// An object specifying the format that the model must output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<ResponseFormat>,
    /// A list of tools the model may call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<ChatCompletionTool>>,
}

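// A hedged sketch showing that `EvalSamplingParams` omits unset fields when
// serialized, so a partially specified config stays minimal on the wire.
#[cfg(test)]
mod sampling_params_json_example {
    use super::*;

    #[test]
    fn unset_fields_are_omitted() {
        let params = EvalSamplingParams {
            temperature: Some(0.0),
            ..Default::default()
        };
        let json = serde_json::to_value(&params).unwrap();
        assert_eq!(json, serde_json::json!({ "temperature": 0.0 }));
    }
}
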
/// Sampling parameters for the model when sampling via the Responses API.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Default)]
pub struct EvalResponsesSamplingParams {
    /// A seed value to initialize the randomness during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens in the generated output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<u32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// An object specifying the format that the model must output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<ResponseFormat>,
    /// A list of tools the model may call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<Tool>>,
    /// Configuration options for a text response from the model. Can be plain
    /// text or structured JSON data. Learn more:
    /// - [Text inputs and outputs](https://platform.openai.com/docs/guides/text)
    /// - [Structured Outputs](https://platform.openai.com/docs/guides/structured-outputs)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text: Option<ResponseTextParam>,
}

/// Completions source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalCompletionsSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
    /// Stored completions source.
    StoredCompletions(EvalStoredCompletionsSource),
}

/// Stored completions source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalStoredCompletionsSource {
    /// Metadata filters for the stored completions.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// An optional model to filter by.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// An optional Unix timestamp to filter items created after this time.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_after: Option<i64>,
    /// An optional Unix timestamp to filter items created before this time.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_before: Option<i64>,
    /// An optional maximum number of items to return.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub limit: Option<i32>,
}

/// Responses run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalResponsesRunDataSource {
    /// Used when sampling from a model. Dictates the structure of the messages passed into the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_messages: Option<EvalInputMessages>,
    /// The sampling parameters for the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_params: Option<EvalResponsesSamplingParams>,
    /// The name of the model to use for generating responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Determines what populates the `item` namespace in this run's data source.
    pub source: EvalResponsesRunSource,
}

/// Responses source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalResponsesRunSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
    /// An EvalResponsesSource object describing a run data source configuration.
    Responses(EvalResponsesSource),
}

/// An EvalResponsesSource object describing a run data source configuration.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalResponsesSource {
    /// Metadata filter for the responses. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,
    /// The name of the model to find responses for. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Optional string to search the `instructions` field. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions_search: Option<String>,
    /// Only include items created after this timestamp (inclusive). This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_after: Option<u64>,
    /// Only include items created before this timestamp (inclusive). This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_before: Option<u64>,
    /// Optional reasoning effort parameter. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// Sampling temperature. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// Nucleus sampling parameter. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// List of user identifiers. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub users: Option<Vec<String>>,
    /// List of tool names. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<String>>,
}

/// List of eval runs.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval run objects.
    pub data: Vec<EvalRun>,
    /// The identifier of the first eval run in the data array.
    pub first_id: String,
    /// The identifier of the last eval run in the data array.
    pub last_id: String,
    /// Indicates whether there are more eval runs available.
    pub has_more: bool,
}

/// Request to create an eval run.
#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
#[builder(name = "CreateEvalRunRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalRunRequest {
    /// The name of the run.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// Details about the run's data source.
    pub data_source: CreateEvalRunDataSource,
    /// Metadata attached to the run.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}

/// Details about the run's data source.
#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalRunDataSource {
    /// JSONL data source.
    Jsonl(CreateEvalJsonlRunDataSource),
    /// Completions data source.
    Completions(CreateEvalCompletionsRunDataSource),
    /// Responses data source.
    Responses(CreateEvalResponsesRunDataSource),
}

// Manual Default implementation for Builder compatibility. There is no
// meaningful default data source, so this panics; always set `data_source`
// explicitly before building.
impl Default for CreateEvalRunDataSource {
    fn default() -> Self {
        panic!("CreateEvalRunDataSource has no default; set `data_source` explicitly")
    }
}

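// A hedged sketch of assembling a `CreateEvalRunRequest` against a previously
// uploaded JSONL file. The struct is built directly here to sidestep the
// panicking `Default` above; the file id is illustrative.
#[cfg(test)]
mod create_eval_run_request_example {
    use super::*;

    #[test]
    fn builds_with_jsonl_file_id_source() {
        let request = CreateEvalRunRequest {
            name: Some("example-run".to_string()),
            data_source: CreateEvalRunDataSource::Jsonl(CreateEvalJsonlRunDataSource {
                source: EvalJsonlSource::FileId(EvalJsonlFileIdSource {
                    id: "file-abc123".to_string(),
                }),
            }),
            metadata: None,
        };
        let json = serde_json::to_value(&request).unwrap();
        assert_eq!(json["data_source"]["type"], "jsonl");
        assert_eq!(json["data_source"]["source"]["type"], "file_id");
    }
}
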
/// Response from deleting an eval run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct DeleteEvalRunResponse {
    /// The object type, which is always "eval.run.deleted".
    pub object: String,
    /// Whether the eval run was deleted.
    pub deleted: bool,
    /// The ID of the deleted eval run.
    pub run_id: String,
}

// EvalRunOutputItem types

/// A schema representing an evaluation run output item.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItem {
    /// The object type, which is always "eval.run.output_item".
    pub object: String,
    /// Unique identifier for the evaluation run output item.
    pub id: String,
    /// The identifier of the evaluation run associated with this output item.
    pub run_id: String,
    /// The identifier of the evaluation group.
    pub eval_id: String,
    /// Unix timestamp (in seconds) when the evaluation run was created.
    pub created_at: i64,
    /// The status of the evaluation run.
    pub status: String,
    /// The identifier for the data source item.
    pub datasource_item_id: u64,
    /// Details of the input data source item.
    pub datasource_item: serde_json::Value,
    /// A list of grader results for this output item.
    pub results: Vec<EvalRunOutputItemResult>,
    /// A sample containing the input and output of the evaluation run.
    pub sample: EvalRunOutputItemSample,
}

/// A single grader result for an evaluation run output item.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemResult {
    /// The name of the grader.
    pub name: String,
    /// The numeric score produced by the grader.
    pub score: f64,
    /// Whether the grader considered the output a pass.
    pub passed: bool,
    /// Optional sample or intermediate data produced by the grader.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample: Option<serde_json::Value>,
}

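// A hedged sketch of deserializing a single grader result as it might appear
// in an output item's `results` array. Field values are illustrative.
#[cfg(test)]
mod output_item_result_json_example {
    use super::*;

    #[test]
    fn parses_grader_result() {
        let json = r#"{"name":"string_check_grader","score":1.0,"passed":true}"#;
        let result: EvalRunOutputItemResult = serde_json::from_str(json).unwrap();
        assert!(result.passed);
        assert_eq!(result.score, 1.0);
    }
}
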
/// A simple output message with a plain string role and content.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct SimpleOutputMessage {
    /// The role of the message.
    pub role: String,
    /// The content of the message.
    pub content: String,
}

/// A sample containing the input and output of the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemSample {
    /// An array of input messages.
    pub input: Vec<SimpleInputMessage>,
    /// An array of output messages.
    pub output: Vec<SimpleOutputMessage>,
    /// The reason why the sample generation was finished.
    pub finish_reason: String,
    /// The model used for generating the sample.
    pub model: String,
    /// Token usage details for the sample.
    pub usage: EvalRunOutputItemUsage,
    /// Error information, if any.
    pub error: Option<EvalApiError>,
    /// The sampling temperature used.
    pub temperature: f64,
    /// The maximum number of tokens allowed for completion.
    pub max_completion_tokens: i32,
    /// The top_p value used for sampling.
    pub top_p: f64,
    /// The seed used for generating the sample.
    pub seed: i32,
}

/// Token usage details for the sample.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemUsage {
    /// The total number of tokens used.
    pub total_tokens: i32,
    /// The number of completion tokens generated.
    pub completion_tokens: i32,
    /// The number of prompt tokens used.
    pub prompt_tokens: i32,
    /// The number of tokens retrieved from cache.
    pub cached_tokens: i32,
}

/// List of eval run output items.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval run output item objects.
    pub data: Vec<EvalRunOutputItem>,
    /// The identifier of the first eval run output item in the data array.
    pub first_id: String,
    /// The identifier of the last eval run output item in the data array.
    pub last_id: String,
    /// Indicates whether there are more eval run output items available.
    pub has_more: bool,
}

/// An object representing an error response from the Eval API.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalApiError {
    /// The error code.
    pub code: String,
    /// The error message.
    pub message: String,
}