Evaluator

Trait Evaluator 

Source
pub trait Evaluator: Module {
    const MAX_CONCURRENCY: usize = 32usize;
    const DISPLAY_PROGRESS: bool = true;

    // Required method
    async fn metric(&self, example: &Example, prediction: &Prediction) -> f32;

    // Provided method
    async fn evaluate(&self, examples: Vec<Example>) -> f32 { ... }
}

Provided Associated Constants§

Source
const MAX_CONCURRENCY: usize = 32usize

Source
const DISPLAY_PROGRESS: bool = true

Required Methods§

Source

async fn metric(&self, example: &Example, prediction: &Prediction) -> f32

Provided Methods§

Source

async fn evaluate(&self, examples: Vec<Example>) -> f32

Examples found in repository?
examples/03-evaluate-hotpotqa.rs (line 78)
async fn main() -> anyhow::Result<()> {
    configure(LM::default(), ChatAdapter {});

    let examples = DataLoader::load_hf(
        "hotpotqa/hotpot_qa",
        vec!["question".to_string()],
        vec!["answer".to_string()],
        "fullwiki",
        "validation",
        true,
    )?[..128]
        .to_vec();

    let evaluator = QARater::builder().build();
    let metric = evaluator.evaluate(examples).await;

    println!("Metric: {metric}");
    Ok(())
}
More examples
Hide additional examples
examples/08-optimize-mipro.rs (line 117)
async fn main() -> Result<()> {
    println!("=== MIPROv2 Optimizer Example ===\n");

    // Configure the LM
    configure(LM::default(), ChatAdapter);

    // Load training data from HuggingFace
    println!("Loading training data from HuggingFace...");
    let train_examples = DataLoader::load_hf(
        "hotpotqa/hotpot_qa",
        vec!["question".to_string()],
        vec!["answer".to_string()],
        "fullwiki",
        "validation",
        true,
    )?;

    // Use a small subset for faster optimization
    let train_subset = train_examples[..15].to_vec();
    println!("Using {} training examples\n", train_subset.len());

    // Create the module
    let mut qa_module = SimpleQA::builder().build();

    // Show initial instruction
    println!("Initial instruction:");
    println!(
        "  \"{}\"\n",
        qa_module.answerer.get_signature().instruction()
    );

    // Test baseline performance
    println!("Evaluating baseline performance...");
    let baseline_score = qa_module.evaluate(train_subset[..5].to_vec()).await;
    println!("Baseline score: {:.3}\n", baseline_score);

    // Create MIPROv2 optimizer
    let optimizer = MIPROv2::builder()
        .num_candidates(8) // Generate 8 candidate prompts
        .num_trials(15) // Run 15 evaluation trials
        .minibatch_size(10) // Evaluate on 10 examples per candidate
        .temperature(1.0) // Temperature for prompt generation
        .track_stats(true) // Display detailed statistics
        .build();

    // Optimize the module
    println!("Starting MIPROv2 optimization...");
    println!("This will:");
    println!("  1. Generate execution traces");
    println!("  2. Create a program description using LLM");
    println!("  3. Generate {} candidate prompts with best practices", 8);
    println!("  4. Evaluate each candidate");
    println!("  5. Select and apply the best prompt\n");

    optimizer
        .compile(&mut qa_module, train_subset.clone())
        .await?;

    // Show optimized instruction
    println!("\nOptimized instruction:");
    println!(
        "  \"{}\"\n",
        qa_module.answerer.get_signature().instruction()
    );

    // Test optimized performance
    println!("Evaluating optimized performance...");
    let optimized_score = qa_module.evaluate(train_subset[..5].to_vec()).await;
    println!("Optimized score: {:.3}", optimized_score);

    // Show improvement
    let improvement = ((optimized_score - baseline_score) / baseline_score) * 100.0;
    println!(
        "\n✓ Improvement: {:.1}% ({:.3} -> {:.3})",
        improvement, baseline_score, optimized_score
    );

    // Test on a new example
    println!("\n--- Testing on a new example ---");
    let test_example = example! {
        "question": "input" => "What is the capital of France?",
    };

    let result = qa_module.forward(test_example).await?;
    println!("Question: What is the capital of France?");
    println!("Answer: {}", result.get("answer", None));

    println!("\n=== Example Complete ===");
    Ok(())
}
examples/09-gepa-sentiment.rs (line 179)
async fn main() -> Result<()> {
    println!("GEPA Sentiment Analysis Optimization Example\n");

    // Setup LM
    let lm = LM::new(LMConfig {
        temperature: 0.7,
        ..LMConfig::default()
    })
    .await;

    configure(lm.clone(), ChatAdapter);

    // Create training examples with diverse sentiments
    let trainset = vec![
        example! {
            "text": "input" => "This movie was absolutely fantastic! I loved every minute of it.",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "Terrible service, will never come back again.",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "The weather is okay, nothing special.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "Despite some minor issues, I'm quite happy with the purchase.",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "I have mixed feelings about this product.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "This is the worst experience I've ever had!",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "It's fine. Does what it's supposed to do.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "Exceeded all my expectations! Highly recommend!",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "Disappointed and frustrated with the outcome.",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "Standard quality, nothing remarkable.",
            "expected_sentiment": "input" => "neutral"
        },
    ];

    // Create module
    let mut module = SentimentAnalyzer::builder()
        .predictor(Predict::new(SentimentSignature::new()))
        .build();

    // Evaluate baseline performance
    println!("Baseline Performance:");
    let baseline_score = module.evaluate(trainset.clone()).await;
    println!("  Average score: {:.3}\n", baseline_score);

    // Configure GEPA optimizer
    let gepa = GEPA::builder()
        .num_iterations(5)
        .minibatch_size(5)
        .num_trials(3)
        .temperature(0.9)
        .track_stats(true)
        .build();

    // Run optimization
    println!("Starting GEPA optimization...\n");
    let result = gepa
        .compile_with_feedback(&mut module, trainset.clone())
        .await?;

    // Display results
    println!("\nOptimization Results:");
    println!(
        "  Best average score: {:.3}",
        result.best_candidate.average_score()
    );
    println!("  Total rollouts: {}", result.total_rollouts);
    println!("  Total LM calls: {}", result.total_lm_calls);
    println!("  Generations: {}", result.evolution_history.len());

    println!("\nBest Instruction:");
    println!("  {}", result.best_candidate.instruction);

    if !result.evolution_history.is_empty() {
        println!("\nEvolution History:");
        for entry in &result.evolution_history {
            println!("  Generation {}: {:.3}", entry.0, entry.1);
        }
    }

    // Test optimized module on a new example
    println!("\nTesting Optimized Module:");
    let test_example = example! {
        "text": "input" => "This product changed my life! Absolutely amazing!",
        "expected_sentiment": "input" => "positive"
    };

    let test_prediction = module.forward(test_example.clone()).await?;
    let test_feedback = module
        .feedback_metric(&test_example, &test_prediction)
        .await;

    println!(
        "  Test prediction: {}",
        test_prediction.get("sentiment", None)
    );
    println!("  Test score: {:.3}", test_feedback.score);
    println!("  Feedback:\n{}", test_feedback.feedback);

    Ok(())
}
examples/10-gepa-llm-judge.rs (line 275)
async fn main() -> Result<()> {
    println!("GEPA with LLM-as-a-Judge Example\n");
    println!("This example shows how to use an LLM judge to automatically");
    println!("generate rich feedback for optimizing a math solver.\n");

    // Setup: Configure the LLM
    // Main LM for the task
    let task_lm = LM::new(LMConfig {
        temperature: 0.7,
        ..LMConfig::default()
    })
    .await;

    // Judge LM (could use a different/cheaper model)
    let judge_lm = LM::new(LMConfig {
        temperature: 0.3,
        ..LMConfig::default()
    })
    .await;

    configure(task_lm, ChatAdapter);

    // Create training examples
    let trainset = vec![
        example! {
            "problem": "input" => "Sarah has 12 apples. She gives 3 to her friend and buys 5 more. How many apples does she have now?",
            "expected_answer": "input" => "14"
        },
        example! {
            "problem": "input" => "A train travels 60 miles in 1 hour. How far will it travel in 3.5 hours at the same speed?",
            "expected_answer": "input" => "210"
        },
        example! {
            "problem": "input" => "There are 24 students in a class. If 1/3 of them are absent, how many students are present?",
            "expected_answer": "input" => "16"
        },
        example! {
            "problem": "input" => "A rectangle has length 8 cm and width 5 cm. What is its area?",
            "expected_answer": "input" => "40"
        },
        example! {
            "problem": "input" => "John has $50. He spends $12 on lunch and $8 on a book. How much money does he have left?",
            "expected_answer": "input" => "30"
        },
    ];

    // Create the module
    let mut module = MathSolver::builder()
        .solver(Predict::new(MathWordProblem::new()))
        .judge(Predict::new(MathJudge::new()))
        .judge_lm(Arc::new(judge_lm))
        .build();

    // Evaluate baseline performance
    println!("Step 1: Baseline Performance");
    println!("Testing the solver before optimization...\n");
    let baseline_score = module.evaluate(trainset.clone()).await;
    println!("  Baseline average score: {:.3}\n", baseline_score);

    // Configure GEPA optimizer
    println!("Step 2: Configure GEPA");
    println!("Setting up the optimizer with budget controls...\n");

    let gepa = GEPA::builder()
        .num_iterations(3) // Fewer iterations for demo
        .minibatch_size(3) // Smaller batches
        .temperature(0.9)
        .track_stats(true)
        .maybe_max_lm_calls(Some(100)) // Important: we're using 2x LM calls (task + judge)
        .build();

    // Run GEPA optimization
    println!("Step 3: Run GEPA Optimization");
    println!("The judge will analyze reasoning quality and provide feedback...\n");

    let result = gepa
        .compile_with_feedback(&mut module, trainset.clone())
        .await?;

    // Display results
    println!("\nStep 4: Results");
    println!("===============\n");
    println!("Optimization complete!");
    println!(
        "  Best average score: {:.3}",
        result.best_candidate.average_score()
    );
    println!(
        "  Improvement: {:.3}",
        result.best_candidate.average_score() - baseline_score
    );
    println!("  Total rollouts: {}", result.total_rollouts);
    println!(
        "  Total LM calls: {} (includes judge evaluations)",
        result.total_lm_calls
    );

    println!("\nEvolution over time:");
    for (generation, score) in &result.evolution_history {
        println!("  Generation {}: {:.3}", generation, score);
    }

    println!("\nOptimized instruction:");
    println!("  {}", result.best_candidate.instruction);

    // Test the optimized solver
    println!("\nStep 5: Test Optimized Solver");
    println!("==============================\n");

    let test_problem = example! {
        "problem": "input" => "A store sells pencils for $0.25 each. If you buy 8 pencils, how much will you pay?",
        "expected_answer": "input" => "2"
    };

    let test_prediction = module.forward(test_problem.clone()).await?;
    let test_feedback = module
        .feedback_metric(&test_problem, &test_prediction)
        .await;

    println!(
        "Test problem: A store sells pencils for $0.25 each. If you buy 8 pencils, how much will you pay?"
    );
    println!("\nAnswer: {}", test_prediction.get("answer", None));
    println!("Score: {:.3}\n", test_feedback.score);
    println!("Detailed Feedback from Judge:");
    println!("{}", test_feedback.feedback);

    Ok(())
}

Dyn Compatibility§

This trait is not dyn compatible.

In older versions of Rust, dyn compatibility was called "object safety", so this trait is not object safe.

Implementors§