Trait Evaluator

pub trait Evaluator: Module {
    const MAX_CONCURRENCY: usize = 32usize;
    const DISPLAY_PROGRESS: bool = true;

    // Required method
    async fn metric(&self, example: &Example, prediction: &Prediction) -> f32;

    // Provided method
    async fn evaluate(&self, examples: Vec<Example>) -> f32 { ... }
}
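
An implementor only has to supply metric; evaluate is provided. The following is a minimal, hypothetical sketch: it assumes a QARater type that already implements Module (the required supertrait), and that Example, like Prediction, exposes the get accessor used in the repository examples below.

impl Evaluator for QARater {
    // Optional: lower the concurrency cap from the default of 32.
    const MAX_CONCURRENCY: usize = 8;

    async fn metric(&self, example: &Example, prediction: &Prediction) -> f32 {
        // Exact-match scoring: 1.0 if the predicted answer equals the
        // reference answer, 0.0 otherwise. The `get` accessor on `Example`
        // is an assumption, mirroring its use on `Prediction` below.
        if prediction.get("answer", None) == example.get("answer", None) {
            1.0
        } else {
            0.0
        }
    }
}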

Provided Associated Constants

const MAX_CONCURRENCY: usize = 32usize

const DISPLAY_PROGRESS: bool = true

Required Methods

async fn metric(&self, example: &Example, prediction: &Prediction) -> f32

Scores a single prediction against its example. Higher values indicate better predictions.

Provided Methods

async fn evaluate(&self, examples: Vec<Example>) -> f32

Runs the module over every example and returns the average metric score across the set.

Examples found in repository
examples/03-evaluate-hotpotqa.rs (line 84)
async fn main() -> anyhow::Result<()> {
    configure(
        LM::builder()
            .model("openai:gpt-4o-mini".to_string())
            .build()
            .await?,
        ChatAdapter {},
    );

    let examples = DataLoader::load_hf(
        "hotpotqa/hotpot_qa",
        vec!["question".to_string()],
        vec!["answer".to_string()],
        "fullwiki",
        "validation",
        true,
    )?[..128]
        .to_vec();

    let evaluator = QARater::builder().build();
    let metric = evaluator.evaluate(examples).await; // <- line 84

    println!("Metric: {metric}");
    Ok(())
}
examples/08-optimize-mipro.rs (line 117)
async fn main() -> Result<()> {
    println!("=== MIPROv2 Optimizer Example ===\n");

    // Configure the LM
    configure(LM::default(), ChatAdapter);

    // Load training data from HuggingFace
    println!("Loading training data from HuggingFace...");
    let train_examples = DataLoader::load_hf(
        "hotpotqa/hotpot_qa",
        vec!["question".to_string()],
        vec!["answer".to_string()],
        "fullwiki",
        "validation",
        true,
    )?;

    // Use a small subset for faster optimization
    let train_subset = train_examples[..15].to_vec();
    println!("Using {} training examples\n", train_subset.len());

    // Create the module
    let mut qa_module = SimpleQA::builder().build();

    // Show initial instruction
    println!("Initial instruction:");
    println!(
        "  \"{}\"\n",
        qa_module.answerer.get_signature().instruction()
    );

    // Test baseline performance
    println!("Evaluating baseline performance...");
    let baseline_score = qa_module.evaluate(train_subset[..5].to_vec()).await; // <- line 117
    println!("Baseline score: {:.3}\n", baseline_score);

    // Create MIPROv2 optimizer
    let optimizer = MIPROv2::builder()
        .num_candidates(8) // Generate 8 candidate prompts
        .num_trials(15) // Run 15 evaluation trials
        .minibatch_size(10) // Evaluate on 10 examples per candidate
        .temperature(1.0) // Temperature for prompt generation
        .track_stats(true) // Display detailed statistics
        .build();

    // Optimize the module
    println!("Starting MIPROv2 optimization...");
    println!("This will:");
    println!("  1. Generate execution traces");
    println!("  2. Create a program description using LLM");
    println!("  3. Generate {} candidate prompts with best practices", 8);
    println!("  4. Evaluate each candidate");
    println!("  5. Select and apply the best prompt\n");

    optimizer
        .compile(&mut qa_module, train_subset.clone())
        .await?;

    // Show optimized instruction
    println!("\nOptimized instruction:");
    println!(
        "  \"{}\"\n",
        qa_module.answerer.get_signature().instruction()
    );

    // Test optimized performance
    println!("Evaluating optimized performance...");
    let optimized_score = qa_module.evaluate(train_subset[..5].to_vec()).await;
    println!("Optimized score: {:.3}", optimized_score);

    // Show improvement
    let improvement = ((optimized_score - baseline_score) / baseline_score) * 100.0;
    println!(
        "\n✓ Improvement: {:.1}% ({:.3} -> {:.3})",
        improvement, baseline_score, optimized_score
    );

    // Test on a new example
    println!("\n--- Testing on a new example ---");
    let test_example = example! {
        "question": "input" => "What is the capital of France?",
    };

    let result = qa_module.forward(test_example).await?;
    println!("Question: What is the capital of France?");
    println!("Answer: {}", result.get("answer", None));

    println!("\n=== Example Complete ===");
    Ok(())
}
examples/09-gepa-sentiment.rs (line 175)
async fn main() -> Result<()> {
    println!("GEPA Sentiment Analysis Optimization Example\n");

    // Setup LM
    let lm = LM::builder().temperature(0.7).build().await.unwrap();

    configure(lm.clone(), ChatAdapter);

    // Create training examples with diverse sentiments
    let trainset = vec![
        example! {
            "text": "input" => "This movie was absolutely fantastic! I loved every minute of it.",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "Terrible service, will never come back again.",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "The weather is okay, nothing special.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "Despite some minor issues, I'm quite happy with the purchase.",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "I have mixed feelings about this product.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "This is the worst experience I've ever had!",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "It's fine. Does what it's supposed to do.",
            "expected_sentiment": "input" => "neutral"
        },
        example! {
            "text": "input" => "Exceeded all my expectations! Highly recommend!",
            "expected_sentiment": "input" => "positive"
        },
        example! {
            "text": "input" => "Disappointed and frustrated with the outcome.",
            "expected_sentiment": "input" => "negative"
        },
        example! {
            "text": "input" => "Standard quality, nothing remarkable.",
            "expected_sentiment": "input" => "neutral"
        },
    ];

    // Create module
    let mut module = SentimentAnalyzer::builder()
        .predictor(Predict::new(SentimentSignature::new()))
        .build();

    // Evaluate baseline performance
    println!("Baseline Performance:");
    let baseline_score = module.evaluate(trainset.clone()).await; // <- line 175
    println!("  Average score: {:.3}\n", baseline_score);

    // Configure GEPA optimizer
    let gepa = GEPA::builder()
        .num_iterations(5)
        .minibatch_size(5)
        .num_trials(3)
        .temperature(0.9)
        .track_stats(true)
        .build();

    // Run optimization
    println!("Starting GEPA optimization...\n");
    let result = gepa
        .compile_with_feedback(&mut module, trainset.clone())
        .await?;

    // Display results
    println!("\nOptimization Results:");
    println!(
        "  Best average score: {:.3}",
        result.best_candidate.average_score()
    );
    println!("  Total rollouts: {}", result.total_rollouts);
    println!("  Total LM calls: {}", result.total_lm_calls);
    println!("  Generations: {}", result.evolution_history.len());

    println!("\nBest Instruction:");
    println!("  {}", result.best_candidate.instruction);

    if !result.evolution_history.is_empty() {
        println!("\nEvolution History:");
        for entry in &result.evolution_history {
            println!("  Generation {}: {:.3}", entry.0, entry.1);
        }
    }

    // Test optimized module on a new example
    println!("\nTesting Optimized Module:");
    let test_example = example! {
        "text": "input" => "This product changed my life! Absolutely amazing!",
        "expected_sentiment": "input" => "positive"
    };

    let test_prediction = module.forward(test_example.clone()).await?;
    let test_feedback = module
        .feedback_metric(&test_example, &test_prediction)
        .await;

    println!(
        "  Test prediction: {}",
        test_prediction.get("sentiment", None)
    );
    println!("  Test score: {:.3}", test_feedback.score);
    println!("  Feedback:\n{}", test_feedback.feedback);

    Ok(())
}
examples/10-gepa-llm-judge.rs (line 267)
async fn main() -> Result<()> {
    println!("GEPA with LLM-as-a-Judge Example\n");
    println!("This example shows how to use an LLM judge to automatically");
    println!("generate rich feedback for optimizing a math solver.\n");

    // Setup: Configure the LLM
    // Main LM for the task
    let task_lm = LM::builder().temperature(0.7).build().await.unwrap();

    // Judge LM (could use a different/cheaper model)
    let judge_lm = LM::builder().temperature(0.3).build().await.unwrap();

    configure(task_lm, ChatAdapter);

    // Create training examples
    let trainset = vec![
        example! {
            "problem": "input" => "Sarah has 12 apples. She gives 3 to her friend and buys 5 more. How many apples does she have now?",
            "expected_answer": "input" => "14"
        },
        example! {
            "problem": "input" => "A train travels 60 miles in 1 hour. How far will it travel in 3.5 hours at the same speed?",
            "expected_answer": "input" => "210"
        },
        example! {
            "problem": "input" => "There are 24 students in a class. If 1/3 of them are absent, how many students are present?",
            "expected_answer": "input" => "16"
        },
        example! {
            "problem": "input" => "A rectangle has length 8 cm and width 5 cm. What is its area?",
            "expected_answer": "input" => "40"
        },
        example! {
            "problem": "input" => "John has $50. He spends $12 on lunch and $8 on a book. How much money does he have left?",
            "expected_answer": "input" => "30"
        },
    ];

    // Create the module
    let mut module = MathSolver::builder()
        .solver(Predict::new(MathWordProblem::new()))
        .judge(Predict::new(MathJudge::new()))
        .judge_lm(Arc::new(judge_lm))
        .build();

    // Evaluate baseline performance
    println!("Step 1: Baseline Performance");
    println!("Testing the solver before optimization...\n");
    let baseline_score = module.evaluate(trainset.clone()).await; // <- line 267
    println!("  Baseline average score: {:.3}\n", baseline_score);

    // Configure GEPA optimizer
    println!("Step 2: Configure GEPA");
    println!("Setting up the optimizer with budget controls...\n");

    let gepa = GEPA::builder()
        .num_iterations(3) // Fewer iterations for demo
        .minibatch_size(3) // Smaller batches
        .temperature(0.9)
        .track_stats(true)
        .maybe_max_lm_calls(Some(100)) // Important: we're using 2x LM calls (task + judge)
        .build();

    // Run GEPA optimization
    println!("Step 3: Run GEPA Optimization");
    println!("The judge will analyze reasoning quality and provide feedback...\n");

    let result = gepa
        .compile_with_feedback(&mut module, trainset.clone())
        .await?;

    // Display results
    println!("\nStep 4: Results");
    println!("===============\n");
    println!("Optimization complete!");
    println!(
        "  Best average score: {:.3}",
        result.best_candidate.average_score()
    );
    println!(
        "  Improvement: {:.3}",
        result.best_candidate.average_score() - baseline_score
    );
    println!("  Total rollouts: {}", result.total_rollouts);
    println!(
        "  Total LM calls: {} (includes judge evaluations)",
        result.total_lm_calls
    );

    println!("\nEvolution over time:");
    for (generation, score) in &result.evolution_history {
        println!("  Generation {}: {:.3}", generation, score);
    }

    println!("\nOptimized instruction:");
    println!("  {}", result.best_candidate.instruction);

    // Test the optimized solver
    println!("\nStep 5: Test Optimized Solver");
    println!("==============================\n");

    let test_problem = example! {
        "problem": "input" => "A store sells pencils for $0.25 each. If you buy 8 pencils, how much will you pay?",
        "expected_answer": "input" => "2"
    };

    let test_prediction = module.forward(test_problem.clone()).await?;
    let test_feedback = module
        .feedback_metric(&test_problem, &test_prediction)
        .await;

    println!(
        "Test problem: A store sells pencils for $0.25 each. If you buy 8 pencils, how much will you pay?"
    );
    println!("\nAnswer: {}", test_prediction.get("answer", None));
    println!("Score: {:.3}\n", test_feedback.score);
    println!("Detailed Feedback from Judge:");
    println!("{}", test_feedback.feedback);

    Ok(())
}

Dyn Compatibility

This trait is not dyn compatible.

In older versions of Rust, dyn compatibility was called "object safety", so this trait is not object safe.
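
Concretely, the async methods and associated constants prevent the trait from being used as a trait object; generic bounds work instead. A hypothetical illustration (QARater stands in for any implementor):

// Does not compile: `Evaluator` is not dyn compatible.
// let boxed: Box<dyn Evaluator> = Box::new(QARater::builder().build());

// Use a generic bound instead:
async fn score<E: Evaluator>(evaluator: &E, examples: Vec<Example>) -> f32 {
    evaluator.evaluate(examples).await
}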

Implementors