Evaluator

Trait Evaluator 

Source
pub trait Evaluator: Module {
    const MAX_CONCURRENCY: usize = 32usize;
    const DISPLAY_PROGRESS: bool = true;

    // Required method
    async fn metric(&self, example: &Example, prediction: &Prediction) -> f32;

    // Provided method
    async fn evaluate(&self, examples: Vec<Example>) -> f32 { ... }
}

Provided Associated Constants§

Source
const MAX_CONCURRENCY: usize = 32usize

Source
const DISPLAY_PROGRESS: bool = true

Required Methods§

Source

async fn metric(&self, example: &Example, prediction: &Prediction) -> f32

Provided Methods§

Source

async fn evaluate(&self, examples: Vec<Example>) -> f32

Examples found in repository?
examples/03-evaluate-hotpotqa.rs (line 78)
async fn main() -> anyhow::Result<()> {
    configure(LM::default(), ChatAdapter {});

    let examples = DataLoader::load_hf(
        "hotpotqa/hotpot_qa",
        vec!["question".to_string()],
        vec!["answer".to_string()],
        "fullwiki",
        "validation",
        true,
    )?[..128]
        .to_vec();

    let evaluator = QARater::builder().build();
    let metric = evaluator.evaluate(examples).await;

    println!("Metric: {metric}");
    Ok(())
}
More examples
Hide additional examples
examples/08-optimize-mipro.rs (line 117)
async fn main() -> Result<()> {
    println!("=== MIPROv2 Optimizer Example ===\n");

    // Configure the LM
    configure(LM::default(), ChatAdapter);

    // Load training data from HuggingFace
    println!("Loading training data from HuggingFace...");
    let train_examples = DataLoader::load_hf(
        "hotpotqa/hotpot_qa",
        vec!["question".to_string()],
        vec!["answer".to_string()],
        "fullwiki",
        "validation",
        true,
    )?;

    // Use a small subset for faster optimization
    let train_subset = train_examples[..15].to_vec();
    println!("Using {} training examples\n", train_subset.len());

    // Create the module
    let mut qa_module = SimpleQA::builder().build();

    // Show initial instruction
    println!("Initial instruction:");
    println!(
        "  \"{}\"\n",
        qa_module.answerer.get_signature().instruction()
    );

    // Test baseline performance
    println!("Evaluating baseline performance...");
    let baseline_score = qa_module.evaluate(train_subset[..5].to_vec()).await;
    println!("Baseline score: {:.3}\n", baseline_score);

    // Create MIPROv2 optimizer
    let optimizer = MIPROv2::builder()
        .num_candidates(8) // Generate 8 candidate prompts
        .num_trials(15) // Run 15 evaluation trials
        .minibatch_size(10) // Evaluate on 10 examples per candidate
        .temperature(1.0) // Temperature for prompt generation
        .track_stats(true) // Display detailed statistics
        .build();

    // Optimize the module
    println!("Starting MIPROv2 optimization...");
    println!("This will:");
    println!("  1. Generate execution traces");
    println!("  2. Create a program description using LLM");
    println!("  3. Generate {} candidate prompts with best practices", 8);
    println!("  4. Evaluate each candidate");
    println!("  5. Select and apply the best prompt\n");

    optimizer
        .compile(&mut qa_module, train_subset.clone())
        .await?;

    // Show optimized instruction
    println!("\nOptimized instruction:");
    println!(
        "  \"{}\"\n",
        qa_module.answerer.get_signature().instruction()
    );

    // Test optimized performance
    println!("Evaluating optimized performance...");
    let optimized_score = qa_module.evaluate(train_subset[..5].to_vec()).await;
    println!("Optimized score: {:.3}", optimized_score);

    // Show improvement
    let improvement = ((optimized_score - baseline_score) / baseline_score) * 100.0;
    println!(
        "\n✓ Improvement: {:.1}% ({:.3} -> {:.3})",
        improvement, baseline_score, optimized_score
    );

    // Test on a new example
    println!("\n--- Testing on a new example ---");
    let test_example = example! {
        "question": "input" => "What is the capital of France?",
    };

    let result = qa_module.forward(test_example).await?;
    println!("Question: What is the capital of France?");
    println!("Answer: {}", result.get("answer", None));

    println!("\n=== Example Complete ===");
    Ok(())
}
examples/09-gepa-sentiment.rs (line 179)
async fn main() -> Result<()> {
    println!("GEPA Sentiment Analysis Optimization Example\n");

    // Setup LM
    let lm = LM::new(LMConfig {
        temperature: 0.7,
        ..LMConfig::default()
    })
    .await;

    configure(lm.clone(), ChatAdapter);

    // Create training examples with diverse sentiments
    let trainset = vec![
        example! {
            "text": "input" => "This movie was absolutely fantastic! I loved every minute of it.",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "Terrible service, will never come back again.",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "The weather is okay, nothing special.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "Despite some minor issues, I'm quite happy with the purchase.",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "I have mixed feelings about this product.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "This is the worst experience I've ever had!",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "It's fine. Does what it's supposed to do.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "Exceeded all my expectations! Highly recommend!",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "Disappointed and frustrated with the outcome.",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "Standard quality, nothing remarkable.",
            "expected_sentiment": "input" => "neutral"
        },
    ];

    // Create module
    let mut module = SentimentAnalyzer::builder()
        .predictor(Predict::new(SentimentSignature::new()))
        .build();

    // Evaluate baseline performance
    println!("Baseline Performance:");
    let baseline_score = module.evaluate(trainset.clone()).await;
    println!("  Average score: {:.3}\n", baseline_score);

    // Configure GEPA optimizer
    let gepa = GEPA::builder()
        .num_iterations(5)
        .minibatch_size(5)
        .num_trials(3)
        .temperature(0.9)
        .track_stats(true)
        .build();

    // Run optimization
    println!("Starting GEPA optimization...\n");
    let result = gepa
        .compile_with_feedback(&mut module, trainset.clone())
        .await?;

    // Display results
    println!("\nOptimization Results:");
    println!(
        "  Best average score: {:.3}",
        result.best_candidate.average_score()
    );
    println!("  Total rollouts: {}", result.total_rollouts);
    println!("  Total LM calls: {}", result.total_lm_calls);
    println!("  Generations: {}", result.evolution_history.len());

    println!("\nBest Instruction:");
    println!("  {}", result.best_candidate.instruction);

    if !result.evolution_history.is_empty() {
        println!("\nEvolution History:");
        for entry in &result.evolution_history {
            println!("  Generation {}: {:.3}", entry.0, entry.1);
        }
    }

    // Test optimized module on a new example
    println!("\nTesting Optimized Module:");
    let test_example = example! {
        "text": "input" => "This product changed my life! Absolutely amazing!",
        "expected_sentiment": "input" => "positive"
    };

    let test_prediction = module.forward(test_example.clone()).await?;
    let test_feedback = module
        .feedback_metric(&test_example, &test_prediction)
        .await;

    println!(
        "  Test prediction: {}",
        test_prediction.get("sentiment", None)
    );
    println!("  Test score: {:.3}", test_feedback.score);
    println!("  Feedback:\n{}", test_feedback.feedback);

    Ok(())
}
examples/10-gepa-llm-judge.rs (line 275)
async fn main() -> Result<()> {
    println!("GEPA with LLM-as-a-Judge Example\n");
    println!("This example shows how to use an LLM judge to automatically");
    println!("generate rich feedback for optimizing a math solver.\n");

    // Setup: Configure the LLM
    // Main LM for the task
    let task_lm = LM::new(LMConfig {
        temperature: 0.7,
        ..LMConfig::default()
    })
    .await;

    // Judge LM (could use a different/cheaper model)
    let judge_lm = LM::new(LMConfig {
        temperature: 0.3,
        ..LMConfig::default()
    })
    .await;

    configure(task_lm, ChatAdapter);

    // Create training examples
    let trainset = vec![
        example! {
            "problem": "input" => "Sarah has 12 apples. She gives 3 to her friend and buys 5 more. How many apples does she have now?",
            "expected_answer": "input" => "14"
        },
        example! {
            "problem": "input" => "A train travels 60 miles in 1 hour. How far will it travel in 3.5 hours at the same speed?",
            "expected_answer": "input" => "210"
        },
        example! {
            "problem": "input" => "There are 24 students in a class. If 1/3 of them are absent, how many students are present?",
            "expected_answer": "input" => "16"
        },
        example! {
            "problem": "input" => "A rectangle has length 8 cm and width 5 cm. What is its area?",
            "expected_answer": "input" => "40"
        },
        example! {
            "problem": "input" => "John has $50. He spends $12 on lunch and $8 on a book. How much money does he have left?",
            "expected_answer": "input" => "30"
        },
    ];

    // Create the module
    let mut module = MathSolver::builder()
        .solver(Predict::new(MathWordProblem::new()))
        .judge(Predict::new(MathJudge::new()))
        .judge_lm(Arc::new(judge_lm))
        .build();

    // Evaluate baseline performance
    println!("Step 1: Baseline Performance");
    println!("Testing the solver before optimization...\n");
    let baseline_score = module.evaluate(trainset.clone()).await;
    println!("  Baseline average score: {:.3}\n", baseline_score);

    // Configure GEPA optimizer
    println!("Step 2: Configure GEPA");
    println!("Setting up the optimizer with budget controls...\n");

    let gepa = GEPA::builder()
        .num_iterations(3) // Fewer iterations for demo
        .minibatch_size(3) // Smaller batches
        .temperature(0.9)
        .track_stats(true)
        .maybe_max_lm_calls(Some(100)) // Important: we're using 2x LM calls (task + judge)
        .build();

    // Run GEPA optimization
    println!("Step 3: Run GEPA Optimization");
    println!("The judge will analyze reasoning quality and provide feedback...\n");

    let result = gepa
        .compile_with_feedback(&mut module, trainset.clone())
        .await?;

    // Display results
    println!("\nStep 4: Results");
    println!("===============\n");
    println!("Optimization complete!");
    println!(
        "  Best average score: {:.3}",
        result.best_candidate.average_score()
    );
    println!(
        "  Improvement: {:.3}",
        result.best_candidate.average_score() - baseline_score
    );
    println!("  Total rollouts: {}", result.total_rollouts);
    println!(
        "  Total LM calls: {} (includes judge evaluations)",
        result.total_lm_calls
    );

    println!("\nEvolution over time:");
    for (generation, score) in &result.evolution_history {
        println!("  Generation {}: {:.3}", generation, score);
    }

    println!("\nOptimized instruction:");
    println!("  {}", result.best_candidate.instruction);

    // Test the optimized solver
    println!("\nStep 5: Test Optimized Solver");
    println!("==============================\n");

    let test_problem = example! {
        "problem": "input" => "A store sells pencils for $0.25 each. If you buy 8 pencils, how much will you pay?",
        "expected_answer": "input" => "2"
    };

    let test_prediction = module.forward(test_problem.clone()).await?;
    let test_feedback = module
        .feedback_metric(&test_problem, &test_prediction)
        .await;

    println!(
        "Test problem: A store sells pencils for $0.25 each. If you buy 8 pencils, how much will you pay?"
    );
    println!("\nAnswer: {}", test_prediction.get("answer", None));
    println!("Score: {:.3}\n", test_feedback.score);
    println!("Detailed Feedback from Judge:");
    println!("{}", test_feedback.feedback);

    Ok(())
}

Dyn Compatibility§

This trait is not dyn compatible.

In older versions of Rust, dyn compatibility was called "object safety", so this trait is not object safe.

Implementors§