Trait Evaluator

pub trait Evaluator: Module {
    const MAX_CONCURRENCY: usize = 32usize;
    const DISPLAY_PROGRESS: bool = true;

    // Required method
    async fn metric(&self, example: &Example, prediction: &Prediction) -> f32;

    // Provided method
    async fn evaluate(&self, examples: Vec<Example>) -> f32 { ... }
}
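
An implementor only has to supply metric; evaluate is provided. The following is a minimal, hypothetical sketch: it assumes a QARater type that already implements Module (the required supertrait), and that Example, like Prediction, exposes the get accessor used in the repository examples below.

impl Evaluator for QARater {
    // Optional: lower the concurrency cap from the default of 32.
    const MAX_CONCURRENCY: usize = 8;

    async fn metric(&self, example: &Example, prediction: &Prediction) -> f32 {
        // Exact-match scoring: 1.0 if the predicted answer equals the
        // reference answer, 0.0 otherwise. The `get` accessor on `Example`
        // is an assumption, mirroring its use on `Prediction` below.
        if prediction.get("answer", None) == example.get("answer", None) {
            1.0
        } else {
            0.0
        }
    }
}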

Provided Associated Constants

const MAX_CONCURRENCY: usize = 32usize

const DISPLAY_PROGRESS: bool = true

Required Methods

async fn metric(&self, example: &Example, prediction: &Prediction) -> f32

Scores a single prediction against its example. Higher values indicate better predictions.

Provided Methods

async fn evaluate(&self, examples: Vec<Example>) -> f32

Runs the module over every example and returns the average metric score across the set.

Examples found in repository
examples/03-evaluate-hotpotqa.rs (line 84)
async fn main() -> anyhow::Result<()> {
    configure(
        LM::builder()
            .model("openai:gpt-4o-mini".to_string())
            .build()
            .await?,
        ChatAdapter {},
    );

    let examples = DataLoader::load_hf(
        "hotpotqa/hotpot_qa",
        vec!["question".to_string()],
        vec!["answer".to_string()],
        "fullwiki",
        "validation",
        true,
    )?[..128]
        .to_vec();

    let evaluator = QARater::builder().build();
    let metric = evaluator.evaluate(examples).await; // <- line 84

    println!("Metric: {metric}");
    Ok(())
}
examples/08-optimize-mipro.rs (line 117)
async fn main() -> Result<()> {
    println!("=== MIPROv2 Optimizer Example ===\n");

    // Configure the LM
    configure(LM::default(), ChatAdapter);

    // Load training data from HuggingFace
    println!("Loading training data from HuggingFace...");
    let train_examples = DataLoader::load_hf(
        "hotpotqa/hotpot_qa",
        vec!["question".to_string()],
        vec!["answer".to_string()],
        "fullwiki",
        "validation",
        true,
    )?;

    // Use a small subset for faster optimization
    let train_subset = train_examples[..15].to_vec();
    println!("Using {} training examples\n", train_subset.len());

    // Create the module
    let mut qa_module = SimpleQA::builder().build();

    // Show initial instruction
    println!("Initial instruction:");
    println!(
        "  \"{}\"\n",
        qa_module.answerer.get_signature().instruction()
    );

    // Test baseline performance
    println!("Evaluating baseline performance...");
    let baseline_score = qa_module.evaluate(train_subset[..5].to_vec()).await; // <- line 117
    println!("Baseline score: {:.3}\n", baseline_score);

    // Create MIPROv2 optimizer
    let optimizer = MIPROv2::builder()
        .num_candidates(8) // Generate 8 candidate prompts
        .num_trials(15) // Run 15 evaluation trials
        .minibatch_size(10) // Evaluate on 10 examples per candidate
        .temperature(1.0) // Temperature for prompt generation
        .track_stats(true) // Display detailed statistics
        .build();

    // Optimize the module
    println!("Starting MIPROv2 optimization...");
    println!("This will:");
    println!("  1. Generate execution traces");
    println!("  2. Create a program description using LLM");
    println!("  3. Generate {} candidate prompts with best practices", 8);
    println!("  4. Evaluate each candidate");
    println!("  5. Select and apply the best prompt\n");

    optimizer
        .compile(&mut qa_module, train_subset.clone())
        .await?;

    // Show optimized instruction
    println!("\nOptimized instruction:");
    println!(
        "  \"{}\"\n",
        qa_module.answerer.get_signature().instruction()
    );

    // Test optimized performance
    println!("Evaluating optimized performance...");
    let optimized_score = qa_module.evaluate(train_subset[..5].to_vec()).await;
    println!("Optimized score: {:.3}", optimized_score);

    // Show improvement
    let improvement = ((optimized_score - baseline_score) / baseline_score) * 100.0;
    println!(
        "\n✓ Improvement: {:.1}% ({:.3} -> {:.3})",
        improvement, baseline_score, optimized_score
    );

    // Test on a new example
    println!("\n--- Testing on a new example ---");
    let test_example = example! {
        "question": "input" => "What is the capital of France?",
    };

    let result = qa_module.forward(test_example).await?;
    println!("Question: What is the capital of France?");
    println!("Answer: {}", result.get("answer", None));

    println!("\n=== Example Complete ===");
    Ok(())
}
examples/09-gepa-sentiment.rs (line 175)
async fn main() -> Result<()> {
    println!("GEPA Sentiment Analysis Optimization Example\n");

    // Setup LM
    let lm = LM::builder().temperature(0.7).build().await.unwrap();

    configure(lm.clone(), ChatAdapter);

    // Create training examples with diverse sentiments
    let trainset = vec![
        example! {
            "text": "input" => "This movie was absolutely fantastic! I loved every minute of it.",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "Terrible service, will never come back again.",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "The weather is okay, nothing special.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "Despite some minor issues, I'm quite happy with the purchase.",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "I have mixed feelings about this product.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "This is the worst experience I've ever had!",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "It's fine. Does what it's supposed to do.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "Exceeded all my expectations! Highly recommend!",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "Disappointed and frustrated with the outcome.",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "Standard quality, nothing remarkable.",
            "expected_sentiment": "input" => "neutral"
        },
    ];

    // Create module
    let mut module = SentimentAnalyzer::builder()
        .predictor(Predict::new(SentimentSignature::new()))
        .build();

    // Evaluate baseline performance
    println!("Baseline Performance:");
    let baseline_score = module.evaluate(trainset.clone()).await; // <- line 175
    println!("  Average score: {:.3}\n", baseline_score);

    // Configure GEPA optimizer
    let gepa = GEPA::builder()
        .num_iterations(5)
        .minibatch_size(5)
        .num_trials(3)
        .temperature(0.9)
        .track_stats(true)
        .build();

    // Run optimization
    println!("Starting GEPA optimization...\n");
    let result = gepa
        .compile_with_feedback(&mut module, trainset.clone())
        .await?;

    // Display results
    println!("\nOptimization Results:");
    println!(
        "  Best average score: {:.3}",
        result.best_candidate.average_score()
    );
    println!("  Total rollouts: {}", result.total_rollouts);
    println!("  Total LM calls: {}", result.total_lm_calls);
    println!("  Generations: {}", result.evolution_history.len());

    println!("\nBest Instruction:");
    println!("  {}", result.best_candidate.instruction);

    if !result.evolution_history.is_empty() {
        println!("\nEvolution History:");
        for entry in &result.evolution_history {
            println!("  Generation {}: {:.3}", entry.0, entry.1);
        }
    }

    // Test optimized module on a new example
    println!("\nTesting Optimized Module:");
    let test_example = example! {
        "text": "input" => "This product changed my life! Absolutely amazing!",
        "expected_sentiment": "input" => "positive"
    };

    let test_prediction = module.forward(test_example.clone()).await?;
    let test_feedback = module
        .feedback_metric(&test_example, &test_prediction)
        .await;

    println!(
        "  Test prediction: {}",
        test_prediction.get("sentiment", None)
    );
    println!("  Test score: {:.3}", test_feedback.score);
    println!("  Feedback:\n{}", test_feedback.feedback);

    Ok(())
}
examples/10-gepa-llm-judge.rs (line 267)
async fn main() -> Result<()> {
    println!("GEPA with LLM-as-a-Judge Example\n");
    println!("This example shows how to use an LLM judge to automatically");
    println!("generate rich feedback for optimizing a math solver.\n");

    // Setup: Configure the LLM
    // Main LM for the task
    let task_lm = LM::builder().temperature(0.7).build().await.unwrap();

    // Judge LM (could use a different/cheaper model)
    let judge_lm = LM::builder().temperature(0.3).build().await.unwrap();

    configure(task_lm, ChatAdapter);

    // Create training examples
    let trainset = vec![
        example! {
            "problem": "input" => "Sarah has 12 apples. She gives 3 to her friend and buys 5 more. How many apples does she have now?",
            "expected_answer": "input" => "14"
        },
        example! {
            "problem": "input" => "A train travels 60 miles in 1 hour. How far will it travel in 3.5 hours at the same speed?",
            "expected_answer": "input" => "210"
        },
        example! {
            "problem": "input" => "There are 24 students in a class. If 1/3 of them are absent, how many students are present?",
            "expected_answer": "input" => "16"
        },
        example! {
            "problem": "input" => "A rectangle has length 8 cm and width 5 cm. What is its area?",
            "expected_answer": "input" => "40"
        },
        example! {
            "problem": "input" => "John has $50. He spends $12 on lunch and $8 on a book. How much money does he have left?",
            "expected_answer": "input" => "30"
        },
    ];

    // Create the module
    let mut module = MathSolver::builder()
        .solver(Predict::new(MathWordProblem::new()))
        .judge(Predict::new(MathJudge::new()))
        .judge_lm(Arc::new(judge_lm))
        .build();

    // Evaluate baseline performance
    println!("Step 1: Baseline Performance");
    println!("Testing the solver before optimization...\n");
    let baseline_score = module.evaluate(trainset.clone()).await; // <- line 267
    println!("  Baseline average score: {:.3}\n", baseline_score);

    // Configure GEPA optimizer
    println!("Step 2: Configure GEPA");
    println!("Setting up the optimizer with budget controls...\n");

    let gepa = GEPA::builder()
        .num_iterations(3) // Fewer iterations for demo
        .minibatch_size(3) // Smaller batches
        .temperature(0.9)
        .track_stats(true)
        .maybe_max_lm_calls(Some(100)) // Important: we're using 2x LM calls (task + judge)
        .build();

    // Run GEPA optimization
    println!("Step 3: Run GEPA Optimization");
    println!("The judge will analyze reasoning quality and provide feedback...\n");

    let result = gepa
        .compile_with_feedback(&mut module, trainset.clone())
        .await?;

    // Display results
    println!("\nStep 4: Results");
    println!("===============\n");
    println!("Optimization complete!");
    println!(
        "  Best average score: {:.3}",
        result.best_candidate.average_score()
    );
    println!(
        "  Improvement: {:.3}",
        result.best_candidate.average_score() - baseline_score
    );
    println!("  Total rollouts: {}", result.total_rollouts);
    println!(
        "  Total LM calls: {} (includes judge evaluations)",
        result.total_lm_calls
    );

    println!("\nEvolution over time:");
    for (generation, score) in &result.evolution_history {
        println!("  Generation {}: {:.3}", generation, score);
    }

    println!("\nOptimized instruction:");
    println!("  {}", result.best_candidate.instruction);

    // Test the optimized solver
    println!("\nStep 5: Test Optimized Solver");
    println!("==============================\n");

    let test_problem = example! {
        "problem": "input" => "A store sells pencils for $0.25 each. If you buy 8 pencils, how much will you pay?",
        "expected_answer": "input" => "2"
    };

    let test_prediction = module.forward(test_problem.clone()).await?;
    let test_feedback = module
        .feedback_metric(&test_problem, &test_prediction)
        .await;

    println!(
        "Test problem: A store sells pencils for $0.25 each. If you buy 8 pencils, how much will you pay?"
    );
    println!("\nAnswer: {}", test_prediction.get("answer", None));
    println!("Score: {:.3}\n", test_feedback.score);
    println!("Detailed Feedback from Judge:");
    println!("{}", test_feedback.feedback);

    Ok(())
}

Dyn Compatibility

This trait is not dyn compatible.

In older versions of Rust, dyn compatibility was called "object safety", so this trait is not object safe.
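
Concretely, the async methods and associated constants prevent the trait from being used as a trait object; generic bounds work instead. A hypothetical illustration (QARater stands in for any implementor):

// Does not compile: `Evaluator` is not dyn compatible.
// let boxed: Box<dyn Evaluator> = Box::new(QARater::builder().build());

// Use a generic bound instead:
async fn score<E: Evaluator>(evaluator: &E, examples: Vec<Example>) -> f32 {
    evaluator.evaluate(examples).await
}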

Implementors