async_openai/types/evals/eval.rs

use derive_builder::Builder;
use serde::{Deserialize, Serialize};

use crate::error::OpenAIError;
use crate::types::chat::{ChatCompletionTool, ImageDetail, InputAudio, ResponseFormat};
use crate::types::graders::{
    GraderLabelModel, GraderPython, GraderScoreModel, GraderStringCheck, GraderTextSimilarity,
};
use crate::types::responses::{ResponseTextParam, Tool};
use crate::types::Metadata;

// Re-export commonly used types
pub use crate::types::responses::{EasyInputMessage, InputTextContent, ReasoningEffort};

/// An Eval object with a data source config and testing criteria.
/// An Eval represents a task to be done for your LLM integration.
/// For example:
/// - Improve the quality of my chatbot
/// - See how well my chatbot handles customer support
/// - Check if o4-mini is better at my use case than gpt-4o
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct Eval {
    /// The object type, which is always "eval".
    pub object: String,
    /// Unique identifier for the evaluation.
    pub id: String,
    /// The name of the evaluation.
    pub name: String,
    /// Configuration of data sources used in runs of the evaluation.
    pub data_source_config: EvalDataSourceConfig,
    /// A list of testing criteria.
    pub testing_criteria: Vec<EvalTestingCriterion>,
    /// The Unix timestamp (in seconds) for when the eval was created.
    pub created_at: u64,
    /// Metadata attached to the eval.
    pub metadata: Metadata,
}

/// Configuration of data sources used in runs of the evaluation.
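///
/// A hedged sketch of dispatching on the variant (illustrative only; `eval` is
/// assumed to be an [`Eval`] already fetched from the API):
///
/// ```ignore
/// match eval.data_source_config {
///     EvalDataSourceConfig::Custom(custom) => println!("item schema: {}", custom.schema),
///     EvalDataSourceConfig::Logs(logs) => println!("log filters: {:?}", logs.metadata),
///     EvalDataSourceConfig::StoredCompletions(_) => { /* deprecated */ }
/// }
/// ```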
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalDataSourceConfig {
    /// Custom data source config.
    Custom(EvalCustomDataSourceConfig),
    /// Logs data source config.
    Logs(EvalLogsDataSourceConfig),
    /// Stored completions data source config (deprecated).
    #[serde(rename = "stored_completions")]
    StoredCompletions(EvalStoredCompletionsDataSourceConfig),
}

/// Custom data source config.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalCustomDataSourceConfig {
    /// The type of data source. Always "custom".
    #[serde(rename = "type")]
    pub r#type: String,
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}

/// Logs data source config.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalLogsDataSourceConfig {
    /// The type of data source. Always "logs".
    #[serde(rename = "type")]
    pub r#type: String,
    /// Metadata filters for the logs data source.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}

/// Stored completions data source config (deprecated).
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalStoredCompletionsDataSourceConfig {
    /// The type of data source. Always "stored_completions".
    #[serde(rename = "type")]
    pub r#type: String,
    /// Metadata filters for the stored completions data source.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// The json schema for the run data source items.
    pub schema: serde_json::Value,
}

/// A single testing criterion (grader) applied during eval runs.
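///
/// A hedged sketch of inspecting the criteria on a fetched eval (illustrative
/// only; `eval` is assumed to be an [`Eval`]):
///
/// ```ignore
/// for criterion in &eval.testing_criteria {
///     match criterion {
///         EvalTestingCriterion::TextSimilarity(ts) => {
///             println!("text_similarity passes at {}", ts.pass_threshold);
///         }
///         EvalTestingCriterion::Python(py) => {
///             println!("python passes at {:?}", py.pass_threshold);
///         }
///         other => println!("{:?}", other),
///     }
/// }
/// ```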
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalTestingCriterion {
    /// Label model grader.
    LabelModel(EvalGraderLabelModel),
    /// String check grader.
    StringCheck(EvalGraderStringCheck),
    /// Text similarity grader.
    TextSimilarity(EvalGraderTextSimilarity),
    /// Python grader.
    Python(EvalGraderPython),
    /// Score model grader.
    ScoreModel(EvalGraderScoreModel),
}

/// Label model grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(transparent)]
pub struct EvalGraderLabelModel(pub GraderLabelModel);

/// String check grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(transparent)]
pub struct EvalGraderStringCheck(pub GraderStringCheck);

/// Text similarity grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalGraderTextSimilarity {
    #[serde(flatten)]
    pub grader: GraderTextSimilarity,
    /// The threshold for the score.
    pub pass_threshold: f64,
}

/// Text similarity metric.
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum TextSimilarityMetric {
    /// Cosine similarity.
    Cosine,
    /// Fuzzy match.
    FuzzyMatch,
    /// BLEU score.
    Bleu,
    /// GLEU score.
    Gleu,
    /// METEOR score.
    Meteor,
    /// ROUGE-1.
    Rouge1,
    /// ROUGE-2.
    Rouge2,
    /// ROUGE-3.
    Rouge3,
    /// ROUGE-4.
    Rouge4,
    /// ROUGE-5.
    Rouge5,
    /// ROUGE-L.
    RougeL,
}

/// Python grader.
/// Corresponds to `GraderPython` in the OpenAPI spec, with an optional pass threshold.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalGraderPython {
    #[serde(flatten)]
    pub grader: GraderPython,
    /// The threshold for the score.
    pub pass_threshold: Option<f64>,
}

/// Sampling parameters for a model-based grader.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct SamplingParams {
    /// A seed value to initialize randomness during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens the grader model may generate in its response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<i32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
}

/// Score model grader.
/// Corresponds to `GraderScoreModel` in the OpenAPI spec, with an optional pass threshold.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalGraderScoreModel {
    #[serde(flatten)]
    pub grader: GraderScoreModel,
    /// The threshold for the score.
    pub pass_threshold: Option<f64>,
}

/// A message input to the model, with a role and templatable content.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalItem {
    /// The role of the message input. One of `user`, `assistant`, `system`, or
    /// `developer`.
    pub role: EvalItemRole,
    /// Inputs to the model - can contain template strings. Supports text, output text, input images, and
    /// input audio, either as a single item or an array of items.
    pub content: EvalItemContent,
}

/// The role of the message input.
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum EvalItemRole {
    /// User role.
    User,
    /// Assistant role.
    Assistant,
    /// System role.
    System,
    /// Developer role.
    Developer,
}

/// Output text from the model.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalItemContentOutputText {
    /// The text output from the model.
    pub text: String,
}

/// Input image block used within EvalItem content arrays.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalItemInputImage {
    /// The URL of the image input.
    pub image_url: String,
    /// The detail level of the image to be sent to the model. One of `high`, `low`, or `auto`.
    /// Defaults to `auto`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub detail: Option<ImageDetail>,
}

/// Inputs to the model - can contain template strings.
/// Supports text, output text, input images, and input audio, either as a single item or an array of items.
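///
/// Because the enum is untagged, a bare string is also accepted via the
/// untagged `Text` fallback of [`EvalItemContentItem`]. A hedged sketch
/// (illustrative only; assumes `serde_json` is in scope):
///
/// ```ignore
/// let content: EvalItemContent =
///     serde_json::from_value(serde_json::json!("Hello, {{item.name}}!")).unwrap();
/// assert!(matches!(
///     content,
///     EvalItemContent::Single(EvalItemContentItem::Text(_))
/// ));
/// ```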
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(untagged)]
pub enum EvalItemContent {
    /// An array of input text, output text, input image, and input audio items.
    Array(Vec<EvalItemContentItem>),
    /// A single content item.
    Single(EvalItemContentItem),
}

/// A single content item: input text, output text, input image, or input audio.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalItemContentItem {
    /// An input text content object with a `type` field.
    InputText(InputTextContent),
    /// An output text from the model.
    OutputText(EvalItemContentOutputText),
    /// An image input to the model.
    InputImage(EvalItemInputImage),
    /// An audio input to the model.
    InputAudio(InputAudio),
    /// A text input to the model (plain string).
    #[serde(untagged)]
    Text(String),
}

/// List of evals.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval objects.
    pub data: Vec<Eval>,
    /// The identifier of the first eval in the data array.
    pub first_id: Option<String>,
    /// The identifier of the last eval in the data array.
    pub last_id: Option<String>,
    /// Indicates whether there are more evals available.
    pub has_more: bool,
}

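/// Request to create an eval.
///
/// A hedged builder sketch (illustrative only; the setter names follow the
/// `derive_builder` attributes below, and the JSON schema content is made up):
///
/// ```ignore
/// let request = CreateEvalRequestArgs::default()
///     .name("customer-support-eval")
///     .data_source_config(CreateEvalDataSourceConfig::Custom(
///         CreateEvalCustomDataSourceConfigArgs::default()
///             .item_schema(serde_json::json!({
///                 "type": "object",
///                 "properties": { "question": { "type": "string" } },
///                 "required": ["question"],
///             }))
///             .include_sample_schema(true)
///             .build()?,
///     ))
///     .testing_criteria(vec![/* graders, e.g. a text_similarity criterion */])
///     .build()?;
/// ```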
#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
#[builder(name = "CreateEvalRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalRequest {
    /// The name of the evaluation.
    pub name: Option<String>,
    /// The configuration for the data source used for the evaluation runs.
    /// Dictates the schema of the data used in the evaluation.
    pub data_source_config: CreateEvalDataSourceConfig,
    /// A list of graders for all eval runs in this group. Graders can reference variables in the data
    /// source using double curly braces notation, like `{{item.variable_name}}`. To reference the model's
    /// output, use the `sample` namespace (i.e., `{{sample.output_text}}`).
    pub testing_criteria: Vec<CreateEvalTestingCriterion>,
    /// Metadata attached to the eval.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}

#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalDataSourceConfig {
    /// A CustomDataSourceConfig object that defines the schema for the data source used for the evaluation
    /// runs. This schema defines the shape of the data that will be:
    /// - used to define your testing criteria, and
    /// - required when creating a run.
    Custom(CreateEvalCustomDataSourceConfig),
    /// A data source config which specifies the metadata property of your logs query.
    /// This is usually metadata like `usecase=chatbot` or `prompt-version=v2`, etc.
    Logs(CreateEvalLogsDataSourceConfig),
}

impl Default for CreateEvalDataSourceConfig {
    fn default() -> Self {
        Self::Custom(CreateEvalCustomDataSourceConfig::default())
    }
}

/// Custom data source config for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalCustomDataSourceConfigArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalCustomDataSourceConfig {
    /// The json schema for each row in the data source.
    pub item_schema: serde_json::Value,
    /// Whether the eval should expect you to populate the sample namespace (i.e., by generating responses
    /// off of your data source).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_sample_schema: Option<bool>,
}

/// Logs data source config for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalLogsDataSourceConfigArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalLogsDataSourceConfig {
    /// Metadata filters for the logs data source.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}

/// A testing criterion (grader) for creating an eval.
#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalTestingCriterion {
    /// A LabelModelGrader object which uses a model to assign labels to each item
    /// in the evaluation.
    LabelModel(CreateEvalLabelModelGrader),
    /// A StringCheckGrader object that performs a string comparison between input and reference using a
    /// specified operation.
    StringCheck(EvalGraderStringCheck),
    /// Text similarity grader.
    TextSimilarity(EvalGraderTextSimilarity),
    /// Python grader.
    Python(EvalGraderPython),
    /// Score model grader.
    ScoreModel(EvalGraderScoreModel),
}

/// Label model grader for creating an eval.
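///
/// A hedged builder sketch (illustrative only; the label set and template
/// text are made up):
///
/// ```ignore
/// let grader = CreateEvalLabelModelGraderArgs::default()
///     .name("sentiment")
///     .model("gpt-4o-mini")
///     .input(vec![CreateEvalItem::Simple(SimpleInputMessage {
///         role: "user".to_string(),
///         content: "Classify the sentiment of: {{item.text}}".to_string(),
///     })])
///     .labels(vec!["positive".to_string(), "negative".to_string()])
///     .passing_labels(vec!["positive".to_string()])
///     .build()?;
/// ```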
#[derive(Debug, Serialize, Clone, PartialEq, Builder, Default)]
#[builder(name = "CreateEvalLabelModelGraderArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalLabelModelGrader {
    /// The name of the grader.
    pub name: String,
    /// The model to use for the evaluation. Must support structured outputs.
    pub model: String,
    /// A list of chat messages forming the prompt or context. May include variable references to the
    /// `item` namespace, i.e., `{{item.name}}`.
    pub input: Vec<CreateEvalItem>,
    /// The labels to assign to each item in the evaluation.
    pub labels: Vec<String>,
    /// The labels that indicate a passing result. Must be a subset of `labels`.
    pub passing_labels: Vec<String>,
}

/// A simple chat message with a plain-text role and content.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub struct SimpleInputMessage {
    /// The role of the message.
    pub role: String,
    /// The content of the message.
    pub content: String,
}

/// A chat message that makes up the prompt or context.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalItem {
    /// A message input to the model with a role indicating instruction following
    /// hierarchy. Instructions given with the `developer` or `system` role take
    /// precedence over instructions given with the `user` role. Messages with the
    /// `assistant` role are presumed to have been generated by the model in previous
    /// interactions.
    Message(EvalItem),

    /// A simple role/content message, accepted without a `type` tag.
    #[serde(untagged)]
    Simple(SimpleInputMessage),
}

/// Request to update an eval.
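///
/// A hedged builder sketch (illustrative only):
///
/// ```ignore
/// let update = UpdateEvalRequestArgs::default()
///     .name("renamed-eval")
///     .build()?;
/// ```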
#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
#[builder(name = "UpdateEvalRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct UpdateEvalRequest {
    /// Rename the evaluation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// Metadata attached to the eval.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}

/// Response from deleting an eval.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct DeleteEvalResponse {
    /// The object type, which is always "eval.deleted".
    pub object: String,
    /// Whether the eval was deleted.
    pub deleted: bool,
    /// The ID of the deleted eval.
    pub eval_id: String,
}

// EvalRun types

/// A schema representing an evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRun {
    /// The object type, which is always "eval.run".
    pub object: String,
    /// Unique identifier for the evaluation run.
    pub id: String,
    /// The identifier of the associated evaluation.
    pub eval_id: String,
    /// The status of the evaluation run.
    pub status: EvalRunStatus,
    /// The model that is evaluated, if applicable.
    pub model: String,
    /// The name of the evaluation run.
    pub name: String,
    /// Unix timestamp (in seconds) when the evaluation run was created.
    pub created_at: u64,
    /// The URL to the rendered evaluation run report on the UI dashboard.
    pub report_url: String,
    /// Counters summarizing the outcomes of the evaluation run.
    pub result_counts: EvalRunResultCounts,
    /// Usage statistics for each model during the evaluation run.
    pub per_model_usage: Option<Vec<EvalRunModelUsage>>,
    /// Results per testing criterion applied during the evaluation run.
    pub per_testing_criteria_results: Option<Vec<EvalRunTestingCriteriaResult>>,
    /// Information about the run's data source.
    pub data_source: EvalRunDataSource,
    /// Metadata attached to the run.
    pub metadata: Metadata,
    /// Error information, if any.
    pub error: Option<EvalApiError>,
}

/// Status of an evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum EvalRunStatus {
    /// Queued.
    Queued,
    /// In progress.
    InProgress,
    /// Completed.
    Completed,
    /// Failed.
    Failed,
    /// Canceled.
    Canceled,
}

/// Counters summarizing the outcomes of the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunResultCounts {
    /// Total number of executed output items.
    pub total: u32,
    /// Number of output items that resulted in an error.
    pub errored: u32,
    /// Number of output items that failed to pass the evaluation.
    pub failed: u32,
    /// Number of output items that passed the evaluation.
    pub passed: u32,
}

/// Usage statistics for each model during the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunModelUsage {
    /// The name of the model.
    pub model_name: String,
    /// The number of invocations.
    pub invocation_count: u32,
    /// The number of prompt tokens used.
    pub prompt_tokens: u32,
    /// The number of completion tokens generated.
    pub completion_tokens: u32,
    /// The total number of tokens used.
    pub total_tokens: u32,
    /// The number of tokens retrieved from cache.
    pub cached_tokens: u32,
}

/// Results per testing criterion applied during the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunTestingCriteriaResult {
    /// A description of the testing criteria.
    pub testing_criteria: String,
    /// Number of tests passed for this criterion.
    pub passed: u32,
    /// Number of tests failed for this criterion.
    pub failed: u32,
}

/// Information about the run's data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalRunDataSource {
    /// A JsonlRunDataSource object that specifies a JSONL file matching the eval's schema.
    Jsonl(CreateEvalJsonlRunDataSource),
    /// A CompletionsRunDataSource object describing a model sampling configuration.
    Completions(CreateEvalCompletionsRunDataSource),
    /// A ResponsesRunDataSource object describing a model sampling configuration.
    Responses(CreateEvalResponsesRunDataSource),
}

/// JSONL run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalJsonlRunDataSource {
    /// Determines what populates the `item` namespace in the data source.
    pub source: EvalJsonlSource,
}

/// JSONL source.
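///
/// A hedged sketch of an inline file-content source (illustrative only; the
/// item payload is made up and must match the eval's `item_schema`):
///
/// ```ignore
/// let source = EvalJsonlSource::FileContent(EvalJsonlFileContentSource {
///     content: vec![EvalJsonlContentItem {
///         item: serde_json::json!({ "question": "What is 2 + 2?", "reference": "4" }),
///         sample: None,
///     }],
/// });
/// ```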
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalJsonlSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
}

/// JSONL file content source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlFileContentSource {
    /// The content of the jsonl file.
    pub content: Vec<EvalJsonlContentItem>,
}

/// JSONL file ID source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlFileIdSource {
    /// The identifier of the file.
    pub id: String,
}

/// JSONL content item.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalJsonlContentItem {
    /// The item data.
    pub item: serde_json::Value,
    /// The sample data, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample: Option<serde_json::Value>,
}

/// Completions run data source.
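///
/// A hedged sketch (illustrative only; the model name and file ID are
/// placeholders):
///
/// ```ignore
/// let data_source = CreateEvalCompletionsRunDataSource {
///     input_messages: EvalInputMessages::ItemReference(ItemReference {
///         item_reference: "item.input_trajectory".to_string(),
///     }),
///     sampling_params: Some(EvalSamplingParams {
///         temperature: Some(0.0),
///         ..Default::default()
///     }),
///     model: "gpt-4o-mini".to_string(),
///     source: EvalCompletionsSource::FileId(EvalJsonlFileIdSource {
///         id: "file-abc123".to_string(),
///     }),
/// };
/// ```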
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalCompletionsRunDataSource {
    /// Used when sampling from a model. Dictates the structure of the messages passed into the model. Can
    /// either be a reference to a prebuilt trajectory (i.e., `item.input_trajectory`), or a template with
    /// variable references to the `item` namespace.
    pub input_messages: EvalInputMessages,
    /// The sampling parameters for the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_params: Option<EvalSamplingParams>,
    /// The name of the model to use for generating completions (e.g. "o3-mini").
    pub model: String,
    /// Determines what populates the `item` namespace in this run's data source.
    pub source: EvalCompletionsSource,
}

/// A list of template chat messages for an eval run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct TemplateInputMessages {
    /// A list of chat messages forming the prompt or context. May include variable references to
    /// the `item` namespace, i.e., `{{item.name}}`.
    pub template: Vec<CreateEvalItem>,
}

/// A reference to a variable in the `item` namespace.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct ItemReference {
    /// A reference to a variable in the `item` namespace, e.g. `item.input_trajectory`.
    pub item_reference: String,
}

/// Input messages for completions and responses run data sources.
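///
/// A hedged sketch of the two shapes (illustrative only; the template text is
/// made up):
///
/// ```ignore
/// // Inline template messages; may reference `{{item.*}}` variables.
/// let template = EvalInputMessages::Template(TemplateInputMessages {
///     template: vec![CreateEvalItem::Simple(SimpleInputMessage {
///         role: "user".to_string(),
///         content: "Answer the question: {{item.question}}".to_string(),
///     })],
/// });
///
/// // Reuse a prebuilt trajectory from the data source instead.
/// let by_reference = EvalInputMessages::ItemReference(ItemReference {
///     item_reference: "item.input_trajectory".to_string(),
/// });
/// ```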
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalInputMessages {
    /// Template input messages.
    Template(TemplateInputMessages),
    /// Item reference input messages.
    ItemReference(ItemReference),
}

/// Sampling parameters for the model.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Default)]
pub struct EvalSamplingParams {
    /// A seed value to initialize randomness during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens in the generated output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<i32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// An object specifying the format that the model must output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<ResponseFormat>,
    /// A list of tools the model may call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<ChatCompletionTool>>,
}

/// Sampling parameters for a responses run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Default)]
pub struct EvalResponsesSamplingParams {
    /// A seed value to initialize randomness during sampling.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<i32>,
    /// An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// A higher temperature increases randomness in the outputs.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// The maximum number of tokens in the generated output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_completion_tokens: Option<u32>,
    /// Optional reasoning effort parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// An object specifying the format that the model must output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<ResponseFormat>,
    /// A list of tools the model may call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<Tool>>,
    /// Configuration options for a text response from the model. Can be plain
    /// text or structured JSON data. Learn more:
    /// - [Text inputs and outputs](https://platform.openai.com/docs/guides/text)
    /// - [Structured Outputs](https://platform.openai.com/docs/guides/structured-outputs)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text: Option<ResponseTextParam>,
}

/// Completions source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalCompletionsSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
    /// Stored completions source.
    StoredCompletions(EvalStoredCompletionsSource),
}

/// Stored completions source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalStoredCompletionsSource {
    /// Metadata filters for the stored completions.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
    /// An optional model to filter by.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// An optional Unix timestamp to filter items created after this time.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_after: Option<u64>,
    /// An optional Unix timestamp to filter items created before this time.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_before: Option<u64>,
    /// An optional maximum number of items to return.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub limit: Option<i32>,
}

/// Responses run data source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct CreateEvalResponsesRunDataSource {
    /// Used when sampling from a model. Dictates the structure of the messages passed into the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_messages: Option<EvalInputMessages>,
    /// The sampling parameters for the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_params: Option<EvalResponsesSamplingParams>,
    /// The name of the model to use for generating responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Determines what populates the `item` namespace in this run's data source.
    pub source: EvalResponsesRunSource,
}

/// Responses source.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvalResponsesRunSource {
    /// File content source.
    FileContent(EvalJsonlFileContentSource),
    /// File ID source.
    FileId(EvalJsonlFileIdSource),
    /// An `EvalResponsesSource` object describing a run data source configuration.
    Responses(EvalResponsesSource),
}

/// An `EvalResponsesSource` object describing a run data source configuration.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalResponsesSource {
    /// Metadata filter for the responses. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,
    /// The name of the model to find responses for. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Optional string to search the 'instructions' field. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions_search: Option<String>,
    /// Only include items created after this timestamp (inclusive). This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_after: Option<u64>,
    /// Only include items created before this timestamp (inclusive). This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_before: Option<u64>,
    /// Optional reasoning effort parameter. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reasoning_effort: Option<ReasoningEffort>,
    /// Sampling temperature. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// Nucleus sampling parameter. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// List of user identifiers. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub users: Option<Vec<String>>,
    /// List of tool names. This is a query parameter used to select responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<String>>,
}

/// List of eval runs.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval run objects.
    pub data: Vec<EvalRun>,
    /// The identifier of the first eval run in the data array.
    pub first_id: Option<String>,
    /// The identifier of the last eval run in the data array.
    pub last_id: Option<String>,
    /// Indicates whether there are more eval runs available.
    pub has_more: bool,
}

/// Request to create an eval run.
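///
/// A hedged builder sketch (illustrative only; the file ID is a placeholder):
///
/// ```ignore
/// let run = CreateEvalRunRequestArgs::default()
///     .name("nightly-regression")
///     .data_source(CreateEvalRunDataSource::Jsonl(CreateEvalJsonlRunDataSource {
///         source: EvalJsonlSource::FileId(EvalJsonlFileIdSource {
///             id: "file-abc123".to_string(),
///         }),
///     }))
///     .build()?;
/// ```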
#[derive(Debug, Serialize, Clone, Builder, PartialEq, Default)]
#[builder(name = "CreateEvalRunRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct CreateEvalRunRequest {
    /// The name of the run.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,
    /// Details about the run's data source.
    pub data_source: CreateEvalRunDataSource,
    /// Metadata attached to the run.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Metadata>,
}

/// Details about the run's data source.
#[derive(Debug, Serialize, Clone, PartialEq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CreateEvalRunDataSource {
    /// JSONL data source.
    Jsonl(CreateEvalJsonlRunDataSource),
    /// Completions data source.
    Completions(CreateEvalCompletionsRunDataSource),
    /// Responses data source.
    Responses(CreateEvalResponsesRunDataSource),
}

// Manual Default implementation to satisfy the builder's `default` option.
// There is no sensible default data source, so this deliberately panics:
// `data_source` must always be set before `build()` is called.
impl Default for CreateEvalRunDataSource {
    fn default() -> Self {
        todo!()
    }
}

/// Response from deleting an eval run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct DeleteEvalRunResponse {
    /// The object type, which is always "eval.run.deleted".
    pub object: String,
    /// Whether the eval run was deleted.
    pub deleted: bool,
    /// The ID of the deleted eval run.
    pub run_id: String,
}

// EvalRunOutputItem types

/// A schema representing an evaluation run output item.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItem {
    /// The object type, which is always "eval.run.output_item".
    pub object: String,
    /// Unique identifier for the evaluation run output item.
    pub id: String,
    /// The identifier of the evaluation run associated with this output item.
    pub run_id: String,
    /// The identifier of the evaluation group.
    pub eval_id: String,
    /// Unix timestamp (in seconds) when the evaluation run was created.
    pub created_at: u64,
    /// The status of the evaluation run.
    pub status: String,
    /// The identifier for the data source item.
    pub datasource_item_id: u64,
    /// Details of the input data source item.
    pub datasource_item: serde_json::Value,
    /// A list of grader results for this output item.
    pub results: Vec<EvalRunOutputItemResult>,
    /// A sample containing the input and output of the evaluation run.
    pub sample: EvalRunOutputItemSample,
}

/// A single grader result for an evaluation run output item.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemResult {
    /// The name of the grader.
    pub name: String,
    /// The numeric score produced by the grader.
    pub score: f64,
    /// Whether the grader considered the output a pass.
    pub passed: bool,
    /// Optional sample or intermediate data produced by the grader.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sample: Option<serde_json::Value>,
}

/// A simple output message with a plain-text role and content.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct SimpleOutputMessage {
    /// The role of the message.
    pub role: String,
    /// The content of the message.
    pub content: String,
}

/// A sample containing the input and output of the evaluation run.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemSample {
    /// An array of input messages.
    pub input: Vec<SimpleInputMessage>,
    /// An array of output messages.
    pub output: Vec<SimpleOutputMessage>,
    /// The reason why the sample generation finished.
    pub finish_reason: String,
    /// The model used for generating the sample.
    pub model: String,
    /// Token usage details for the sample.
    pub usage: EvalRunOutputItemUsage,
    /// Error information, if any.
    pub error: Option<EvalApiError>,
    /// The sampling temperature used.
    pub temperature: f64,
    /// The maximum number of tokens allowed for completion.
    pub max_completion_tokens: i32,
    /// The top_p value used for sampling.
    pub top_p: f64,
    /// The seed used for generating the sample.
    pub seed: i32,
}

/// Token usage details for the sample.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemUsage {
    /// The total number of tokens used.
    pub total_tokens: i32,
    /// The number of completion tokens generated.
    pub completion_tokens: i32,
    /// The number of prompt tokens used.
    pub prompt_tokens: i32,
    /// The number of tokens retrieved from cache.
    pub cached_tokens: i32,
}

/// List of eval run output items.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalRunOutputItemList {
    /// The object type, which is always "list".
    pub object: String,
    /// An array of eval run output item objects.
    pub data: Vec<EvalRunOutputItem>,
    /// The identifier of the first eval run output item in the data array.
    pub first_id: Option<String>,
    /// The identifier of the last eval run output item in the data array.
    pub last_id: Option<String>,
    /// Indicates whether there are more eval run output items available.
    pub has_more: bool,
}

/// An object representing an error response from the Eval API.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct EvalApiError {
    /// The error code.
    pub code: String,
    /// The error message.
    pub message: String,
}