1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
//! Unified evaluation report.
//!
//! Provides a single cohesive report structure that aggregates results from
//! all evaluation modules. This is the primary output type users should work with.
//!
//! # Example
//!
//! ```rust
//! use anno_eval::eval::report::{EvalReport, ReportBuilder};
//! use anno::RegexNER;
//!
//! let model = RegexNER::new();
//! let report = ReportBuilder::new("RegexNER")
//!     .with_core_metrics(true)
//!     .with_bias_analysis(false) // Skip if no PER/ORG support
//!     .with_error_analysis(true)
//!     .build(&model);
//!
//! println!("{}", report.summary());
//! ```
use anno::{Model, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
// =============================================================================
// Core Report Structure
// =============================================================================
/// Unified evaluation report aggregating all analysis results.
///
/// Instead of 16 different Results structs, this provides one cohesive view.
/// Optional sections are `None` when the corresponding analysis was disabled
/// on the [`ReportBuilder`] or was not applicable to the model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// Model/system identifier
    pub model_name: String,
    /// Timestamp of evaluation (plain text; see `chrono_lite_timestamp`)
    pub timestamp: String,
    /// Core NER metrics (always present)
    pub core: CoreMetrics,
    /// Per-entity-type breakdown, keyed by entity-type label
    pub per_type: HashMap<String, TypeMetrics>,
    /// Error analysis (if enabled)
    pub errors: Option<ErrorSummary>,
    /// Bias analysis (if enabled and applicable)
    pub bias: Option<BiasSummary>,
    /// Data quality findings (if enabled)
    pub data_quality: Option<DataQualitySummary>,
    /// Calibration analysis (if model provides confidence)
    pub calibration: Option<CalibrationSummary>,
    /// Recommendations based on findings
    pub recommendations: Vec<Recommendation>,
    /// Raw warnings/notes generated during evaluation
    pub warnings: Vec<String>,
}
/// Core precision/recall/F1 metrics.
///
/// All rate fields are fractions in `[0.0, 1.0]`, not percentages.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoreMetrics {
    /// Micro-averaged precision (total_correct / total_predicted)
    pub precision: f64,
    /// Micro-averaged recall (total_correct / total_gold)
    pub recall: f64,
    /// Micro-averaged F1 (harmonic mean of precision and recall)
    pub f1: f64,
    /// Total entities in gold standard
    pub total_gold: usize,
    /// Total entities predicted
    pub total_predicted: usize,
    /// Total correct predictions
    pub total_correct: usize,
    /// Macro-averaged F1 (unweighted mean over entity types, for comparison)
    pub macro_f1: Option<f64>,
}
/// Per-type metrics breakdown.
///
/// Rates are fractions in `[0.0, 1.0]`; counts refer to a single entity type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TypeMetrics {
    /// Precision for this type
    pub precision: f64,
    /// Recall for this type
    pub recall: f64,
    /// F1 for this type
    pub f1: f64,
    /// Number of gold entities of this type
    pub support: usize,
    /// Number of predicted entities of this type
    pub predicted: usize,
    /// Number of correctly predicted entities
    pub correct: usize,
}
/// Error analysis summary.
///
/// NOTE(review): `boundary_errors` and `type_errors` are currently always 0
/// in reports produced by `ReportBuilder::build` (span/type comparison is not
/// yet implemented there).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorSummary {
    /// Total errors (false positives + false negatives)
    pub total_errors: usize,
    /// Boundary errors (correct type, wrong span)
    pub boundary_errors: usize,
    /// Type errors (correct span, wrong type)
    pub type_errors: usize,
    /// False positives (spurious entities)
    pub false_positives: usize,
    /// False negatives (missed entities)
    pub false_negatives: usize,
    /// Most common error patterns (human-readable samples)
    pub top_patterns: Vec<String>,
}
/// Bias analysis summary.
///
/// Each sub-analysis is `None` when it was not run or is not applicable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BiasSummary {
    /// Whether significant bias was detected by any sub-analysis
    pub bias_detected: bool,
    /// Gender bias metrics (if applicable)
    pub gender: Option<GenderBiasMetrics>,
    /// Demographic bias metrics (if applicable)
    pub demographic: Option<DemographicBiasMetrics>,
    /// Length bias (short vs long entities)
    pub length: Option<LengthBiasMetrics>,
}
/// Gender bias metrics from WinoBias-style evaluation.
///
/// Accuracies are fractions in `[0.0, 1.0]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenderBiasMetrics {
    /// Accuracy on pro-stereotypical examples
    pub pro_stereotype_accuracy: f64,
    /// Accuracy on anti-stereotypical examples
    pub anti_stereotype_accuracy: f64,
    /// Gap between pro and anti (lower is better)
    pub gap: f64,
    /// Human-readable verdict
    pub verdict: String,
}
/// Demographic bias metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DemographicBiasMetrics {
    /// Performance gap between best and worst demographic groups (lower is better)
    pub max_gap: f64,
    /// Groups with notably lower performance than the overall rate
    pub underperforming_groups: Vec<String>,
}
/// Entity length bias metrics.
///
/// F1 values are fractions in `[0.0, 1.0]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LengthBiasMetrics {
    /// F1 on short entities (1-2 tokens)
    pub short_entity_f1: f64,
    /// F1 on long entities (4+ tokens)
    pub long_entity_f1: f64,
    /// Gap between short and long (lower is better)
    pub gap: f64,
}
/// Data quality findings.
///
/// NOTE(review): `leakage_detected` is always `false` in reports built by
/// `ReportBuilder` — leakage detection needs training data, which is not
/// available in that context.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataQualitySummary {
    /// Potential train/test leakage detected
    pub leakage_detected: bool,
    /// Fraction of redundant (duplicate, case-insensitive) examples in `[0.0, 1.0]`
    pub redundancy_rate: f64,
    /// Ambiguous entity annotations found (same text, multiple types)
    pub ambiguous_count: usize,
}
/// Calibration analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CalibrationSummary {
    /// Expected Calibration Error (lower is better)
    pub ece: f64,
    /// Maximum Calibration Error
    pub mce: f64,
    /// Optimal confidence threshold (defaults to 0.5 when undeterminable)
    pub optimal_threshold: f64,
    /// Grade (A/B/C/D/F; '?' means no calibrated predictions were available)
    pub grade: char,
}
/// Actionable recommendation derived from evaluation findings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
    /// Priority: high, medium, low
    pub priority: Priority,
    /// Category of recommendation
    pub category: RecommendationCategory,
    /// Human-readable recommendation
    pub message: String,
    /// Estimated impact if addressed (free text)
    pub estimated_impact: Option<String>,
}
/// Priority level for recommendations.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Priority {
    /// Urgent - blocks deployment
    High,
    /// Important - should be addressed
    Medium,
    /// Nice to have
    Low,
}
/// Category of recommendation, used to group findings in downstream tooling.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RecommendationCategory {
    /// Core performance issues (F1/P/R)
    Performance,
    /// Bias-related issues
    Bias,
    /// Dataset quality issues
    DataQuality,
    /// Confidence calibration issues
    Calibration,
    /// Entity type coverage issues
    Coverage,
}
// =============================================================================
// Report Builder
// =============================================================================
/// Builder for constructing evaluation reports.
///
/// Flags select which optional analyses run during [`ReportBuilder::build`].
/// When `test_data` is `None`, a small synthetic test set is used.
pub struct ReportBuilder {
    // Identifier recorded in the resulting report.
    model_name: String,
    // NOTE(review): stored by `with_core_metrics` but not currently consulted
    // by `build` — core metrics are always computed.
    include_core: bool,
    // When true, `build` populates `EvalReport::errors`.
    include_errors: bool,
    // When true (and the `eval-bias` feature is on), bias analysis runs.
    include_bias: bool,
    // When true (and the `eval` feature is on), data-quality checks run.
    include_data_quality: bool,
    // When true (and the `eval` feature is on), calibration analysis runs.
    include_calibration: bool,
    // Custom evaluation data; `None` falls back to `default_synthetic_cases`.
    test_data: Option<Vec<TestCase>>,
}
/// A single test case for evaluation: one input text plus its gold annotations.
pub struct TestCase {
    /// Input text
    pub text: String,
    /// Gold standard entities (character offsets into `text`)
    pub gold_entities: Vec<SimpleGoldEntity>,
}
/// Simplified gold entity for report generation.
///
/// This is a simplified version with string-based entity types,
/// designed for report generation where type normalization is handled externally.
///
/// For evaluation code, use [`super::datasets::GoldEntity`] instead.
#[derive(Debug, Clone)]
pub struct SimpleGoldEntity {
    /// Entity text
    pub text: String,
    /// Entity type label (e.g., "PER", "ORG", "DATE")
    pub entity_type: String,
    /// Start character offset
    pub start: usize,
    /// End character offset (exclusive)
    pub end: usize,
}
impl SimpleGoldEntity {
    /// Safely extract text from source using character offsets.
    ///
    /// Offsets are interpreted as *character* indices (not byte indices), so
    /// extraction is safe for multi-byte UTF-8 text.
    ///
    /// # Arguments
    /// * `source_text` - The original text from which this entity was extracted
    ///
    /// # Returns
    /// The extracted text, or an empty string when the offsets do not describe
    /// a valid, in-bounds, non-empty character range.
    #[must_use]
    pub fn extract_text(&self, source_text: &str) -> String {
        let total_chars = source_text.chars().count();
        // Guard: the range must be non-empty and fully inside the text.
        if self.start >= self.end || self.end > total_chars {
            return String::new();
        }
        source_text
            .chars()
            .enumerate()
            .skip_while(|(i, _)| *i < self.start)
            .take_while(|(i, _)| *i < self.end)
            .map(|(_, c)| c)
            .collect()
    }
}
impl ReportBuilder {
    /// Create a new report builder with default settings.
    ///
    /// Defaults: core metrics and error analysis enabled; bias, data-quality
    /// and calibration analyses disabled; no custom test data (a synthetic
    /// set is used at build time).
    pub fn new(model_name: &str) -> Self {
        Self {
            model_name: model_name.to_string(),
            include_core: true,
            include_errors: true,
            include_bias: false,
            include_data_quality: false,
            include_calibration: false,
            test_data: None,
        }
    }
    /// Include core metrics (default: true).
    ///
    /// NOTE(review): `build` currently always computes core metrics; this
    /// flag is stored but not yet consulted.
    pub fn with_core_metrics(mut self, include: bool) -> Self {
        self.include_core = include;
        self
    }
    /// Include error analysis (default: true).
    pub fn with_error_analysis(mut self, include: bool) -> Self {
        self.include_errors = include;
        self
    }
    /// Include bias analysis (default: false).
    /// Only meaningful for models that detect PER/ORG entities.
    pub fn with_bias_analysis(mut self, include: bool) -> Self {
        self.include_bias = include;
        self
    }
    /// Include data quality checks (default: false).
    pub fn with_data_quality(mut self, include: bool) -> Self {
        self.include_data_quality = include;
        self
    }
    /// Include calibration analysis (default: false).
    /// Only meaningful for models that provide confidence scores.
    pub fn with_calibration(mut self, include: bool) -> Self {
        self.include_calibration = include;
        self
    }
    /// Set test data for evaluation (replaces the synthetic default).
    pub fn with_test_data(mut self, data: Vec<TestCase>) -> Self {
        self.test_data = Some(data);
        self
    }
    /// Run bias analysis (gender + demographic) using the eval subsystem.
    ///
    /// # Errors
    /// Propagates errors from the underlying evaluators.
    #[cfg(feature = "eval-bias")]
    fn run_bias_analysis<M: Model>(model: &M) -> Result<BiasSummary> {
        use crate::eval::coref_resolver::SimpleCorefResolver;
        use crate::eval::demographic_bias::{
            create_diverse_name_dataset, DemographicBiasEvaluator,
        };
        use crate::eval::gender_bias::{create_winobias_templates, GenderBiasEvaluator};
        // Run demographic bias analysis over a diverse-name probe set.
        let names = create_diverse_name_dataset();
        let evaluator = DemographicBiasEvaluator::new(true);
        let demo_results = evaluator.evaluate_ner(model, &names);
        // Gender bias (coreference, WinoBias-style templates).
        let resolver = SimpleCorefResolver::default();
        let templates = create_winobias_templates();
        let gender_evaluator = GenderBiasEvaluator::new(true);
        let gender_results = gender_evaluator.evaluate_resolver(&resolver, &templates);
        // A gap above 10 percentage points on either axis counts as bias.
        let bias_detected =
            gender_results.bias_gap > 0.1 || demo_results.ethnicity_parity_gap > 0.1;
        // Groups more than 10 points below the overall recognition rate.
        let mut underperforming_groups = Vec::new();
        for (ethnicity, rate) in &demo_results.by_ethnicity {
            if *rate < demo_results.overall_recognition_rate - 0.1 {
                underperforming_groups.push(ethnicity.clone());
            }
        }
        Ok(BiasSummary {
            bias_detected,
            gender: Some(GenderBiasMetrics {
                pro_stereotype_accuracy: gender_results.pro_stereotype_accuracy,
                anti_stereotype_accuracy: gender_results.anti_stereotype_accuracy,
                gap: gender_results.bias_gap,
                verdict: if gender_results.bias_gap > 0.1 {
                    "Significant gender bias detected".to_string()
                } else {
                    "No significant gender bias".to_string()
                },
            }),
            demographic: Some(DemographicBiasMetrics {
                max_gap: demo_results
                    .ethnicity_parity_gap
                    .max(demo_results.script_bias_gap),
                underperforming_groups,
            }),
            length: None, // Can be added if needed
        })
    }
    /// Run calibration analysis over the model's calibrated predictions.
    ///
    /// Entities whose extraction method is not calibrated are skipped; when no
    /// calibrated predictions exist at all, a placeholder summary with grade
    /// `'?'` is returned.
    ///
    /// # Errors
    /// Propagates errors from the calibration evaluator.
    #[cfg(feature = "eval")]
    fn run_calibration_analysis<M: Model>(
        model: &M,
        test_cases: &[TestCase],
    ) -> Result<CalibrationSummary> {
        use crate::eval::calibration::CalibrationEvaluator;
        // Collect (confidence, is_correct) pairs for calibrated entities only.
        let mut predictions = Vec::new();
        let mut has_calibrated_entities = false;
        for case in test_cases {
            let entities = model
                .extract_entities(&case.text, None)
                .unwrap_or_else(|_| Vec::new());
            for entity in &entities {
                // Check if this entity's extraction method is calibrated.
                let is_calibrated = entity
                    .provenance
                    .as_ref()
                    .map(|p| p.method.is_calibrated())
                    .unwrap_or(false);
                if !is_calibrated {
                    continue; // Skip uncalibrated entities
                }
                has_calibrated_entities = true;
                // Exact span + label match against gold determines correctness.
                let is_correct = case.gold_entities.iter().any(|gold| {
                    entity.start() == gold.start
                        && entity.end() == gold.end
                        && entity.entity_type.as_label() == gold.entity_type
                });
                predictions.push((entity.confidence.into(), is_correct));
            }
        }
        // If no calibrated entities were found, return a placeholder summary.
        if !has_calibrated_entities || predictions.is_empty() {
            return Ok(CalibrationSummary {
                ece: 0.0,
                mce: 0.0,
                optimal_threshold: 0.5,
                grade: '?', // Unknown - no calibrated predictions
            });
        }
        // Compute calibration metrics (ECE/MCE/per-threshold accuracy).
        let results = CalibrationEvaluator::compute(&predictions);
        // Find optimal threshold: highest accuracy among thresholds retaining
        // at least 10% coverage; fall back to 0.5.
        let optimal_threshold = results
            .threshold_accuracy
            .iter()
            .filter(|(_, metrics)| metrics.coverage >= 0.1) // At least 10% coverage
            .max_by(|(_, a), (_, b)| {
                a.accuracy
                    .partial_cmp(&b.accuracy)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .and_then(|(thresh_str, _)| thresh_str.parse::<f64>().ok())
            .unwrap_or(0.5);
        // Convert ECE to a letter grade (smaller ECE is better).
        let grade = if results.ece < 0.05 {
            'A'
        } else if results.ece < 0.10 {
            'B'
        } else if results.ece < 0.15 {
            'C'
        } else if results.ece < 0.25 {
            'D'
        } else {
            'F'
        };
        Ok(CalibrationSummary {
            ece: results.ece,
            mce: results.mce,
            optimal_threshold,
            grade,
        })
    }
    /// Run data quality checks (redundancy and ambiguity) on the test set.
    ///
    /// Train/test leakage cannot be detected here because only test data is
    /// available, so `leakage_detected` is always `false`.
    ///
    /// # Errors
    /// Currently infallible; kept as `Result` for interface stability.
    #[cfg(feature = "eval")]
    fn run_data_quality_checks(test_cases: &[TestCase]) -> Result<DataQualitySummary> {
        use std::collections::{HashMap, HashSet};
        if test_cases.is_empty() {
            return Ok(DataQualitySummary {
                leakage_detected: false,
                redundancy_rate: 0.0,
                ambiguous_count: 0,
            });
        }
        // Borrowed view of (text, [(entity_text, entity_type)]) per case.
        let test_data: Vec<(&str, Vec<(&str, &str)>)> = test_cases
            .iter()
            .map(|case| {
                let entities: Vec<(&str, &str)> = case
                    .gold_entities
                    .iter()
                    .map(|e| (e.text.as_str(), e.entity_type.as_str()))
                    .collect();
                (case.text.as_str(), entities)
            })
            .collect();
        // Redundancy: case-insensitive duplicate texts within the test set.
        let mut seen_texts = HashSet::new();
        let mut duplicate_count = 0;
        for (text, _) in &test_data {
            let normalized = text.to_lowercase();
            if !seen_texts.insert(normalized) {
                duplicate_count += 1;
            }
        }
        let redundancy_rate = if test_data.is_empty() {
            0.0
        } else {
            duplicate_count as f64 / test_data.len() as f64
        };
        // Ambiguity: the same entity text annotated with different types.
        let mut text_to_types: HashMap<String, HashSet<String>> = HashMap::new();
        for (_, entities) in &test_data {
            for (text, entity_type) in entities {
                text_to_types
                    .entry(text.to_lowercase())
                    .or_default()
                    .insert(entity_type.to_string());
            }
        }
        let ambiguous_count = text_to_types
            .values()
            .filter(|types| types.len() > 1)
            .count();
        Ok(DataQualitySummary {
            leakage_detected: false, // Cannot determine without train data
            redundancy_rate,
            ambiguous_count,
        })
    }
    /// Build the report by running the model on the test data.
    ///
    /// Extraction failures and disabled/unavailable analyses are reported via
    /// `EvalReport::warnings` rather than aborting the build.
    pub fn build<M: Model>(self, model: &M) -> EvalReport {
        let timestamp = chrono_lite_timestamp();
        let mut warnings = Vec::new();
        let mut recommendations = Vec::new();
        // Get test data (use synthetic if none provided).
        let test_cases = self.test_data.unwrap_or_else(|| {
            warnings.push("Using synthetic test data (no custom data provided)".into());
            default_synthetic_cases()
        });
        // Run model predictions and tally exact-match counts.
        let mut total_gold = 0;
        let mut total_predicted = 0;
        let mut total_correct = 0;
        // type -> (gold count, predicted count, correct count)
        let mut per_type_stats: HashMap<String, (usize, usize, usize)> = HashMap::new();
        let mut all_errors = Vec::new();
        for case in &test_cases {
            let predictions = model
                .extract_entities(&case.text, None)
                .unwrap_or_else(|e| {
                    warnings.push(format!("Failed to extract entities for test case: {}", e));
                    Vec::new()
                });
            total_gold += case.gold_entities.len();
            total_predicted += predictions.len();
            // Match predictions to gold (exact span + label).
            for gold in &case.gold_entities {
                let type_key = gold.entity_type.clone();
                let entry = per_type_stats.entry(type_key).or_insert((0, 0, 0));
                entry.0 += 1; // gold count
                let matched = predictions.iter().any(|p| {
                    p.start() == gold.start
                        && p.end() == gold.end
                        && p.entity_type.as_label() == gold.entity_type
                });
                if matched {
                    total_correct += 1;
                    entry.2 += 1; // correct count
                } else {
                    all_errors.push(format!("Missed: {} ({})", gold.text, gold.entity_type));
                }
            }
            for pred in &predictions {
                let type_key = pred.entity_type.as_label().to_string();
                let entry = per_type_stats.entry(type_key).or_insert((0, 0, 0));
                entry.1 += 1; // predicted count
            }
        }
        // Compute micro-averaged core metrics; guard against division by zero.
        let precision = if total_predicted > 0 {
            total_correct as f64 / total_predicted as f64
        } else {
            0.0
        };
        let recall = if total_gold > 0 {
            total_correct as f64 / total_gold as f64
        } else {
            0.0
        };
        let f1 = if precision + recall > 0.0 {
            2.0 * precision * recall / (precision + recall)
        } else {
            0.0
        };
        let mut core = CoreMetrics {
            precision,
            recall,
            f1,
            total_gold,
            total_predicted,
            total_correct,
            macro_f1: None, // Populated after the per-type pass below
        };
        // Per-type metrics.
        let mut per_type = HashMap::new();
        let mut type_f1s = Vec::new();
        for (type_name, (gold, pred, correct)) in &per_type_stats {
            let p = if *pred > 0 {
                *correct as f64 / *pred as f64
            } else {
                0.0
            };
            let r = if *gold > 0 {
                *correct as f64 / *gold as f64
            } else {
                0.0
            };
            let f = if p + r > 0.0 {
                2.0 * p * r / (p + r)
            } else {
                0.0
            };
            type_f1s.push(f);
            per_type.insert(
                type_name.clone(),
                TypeMetrics {
                    precision: p,
                    recall: r,
                    f1: f,
                    support: *gold,
                    predicted: *pred,
                    correct: *correct,
                },
            );
        }
        // Fix: `type_f1s` was previously collected but never used, leaving
        // `macro_f1` permanently `None`. Populate it with the unweighted mean
        // of per-type F1 scores (the standard macro average).
        core.macro_f1 = if type_f1s.is_empty() {
            None
        } else {
            Some(type_f1s.iter().sum::<f64>() / type_f1s.len() as f64)
        };
        // Generate recommendations from the headline numbers.
        if f1 < 0.5 {
            recommendations.push(Recommendation {
                priority: Priority::High,
                category: RecommendationCategory::Performance,
                message: format!(
                    "F1 score ({:.1}%) is below acceptable threshold",
                    f1 * 100.0
                ),
                estimated_impact: Some("Core functionality compromised".into()),
            });
        }
        if recall < precision * 0.7 {
            recommendations.push(Recommendation {
                priority: Priority::Medium,
                category: RecommendationCategory::Coverage,
                message: "Recall significantly lower than precision - model is too conservative"
                    .into(),
                estimated_impact: Some("Missing many valid entities".into()),
            });
        }
        // Error summary (exact-match counting, so FP/FN only; boundary and
        // type confusions are not separated here).
        let errors = if self.include_errors {
            let false_negatives = total_gold - total_correct;
            let false_positives = total_predicted - total_correct;
            Some(ErrorSummary {
                total_errors: false_negatives + false_positives,
                boundary_errors: 0, // Would need span comparison
                type_errors: 0,     // Would need type comparison
                false_positives,
                false_negatives,
                top_patterns: all_errors.into_iter().take(5).collect(),
            })
        } else {
            None
        };
        // Bias analysis (if enabled and the feature is compiled in).
        let bias = if self.include_bias {
            #[cfg(feature = "eval-bias")]
            {
                match Self::run_bias_analysis(model) {
                    Ok(bias_results) => Some(bias_results),
                    Err(e) => {
                        warnings.push(format!("Bias analysis failed: {}", e));
                        None
                    }
                }
            }
            #[cfg(not(feature = "eval-bias"))]
            {
                None
            }
        } else {
            None
        };
        // Calibration (if enabled and the feature is compiled in).
        let calibration = if self.include_calibration {
            #[cfg(feature = "eval")]
            {
                match Self::run_calibration_analysis(model, &test_cases) {
                    Ok(cal_results) => Some(cal_results),
                    Err(e) => {
                        warnings.push(format!("Calibration analysis failed: {}", e));
                        None
                    }
                }
            }
            #[cfg(not(feature = "eval"))]
            {
                None
            }
        } else {
            None
        };
        // Data quality (if enabled and the feature is compiled in).
        let data_quality = if self.include_data_quality {
            #[cfg(feature = "eval")]
            {
                match Self::run_data_quality_checks(&test_cases) {
                    Ok(quality_results) => Some(quality_results),
                    Err(e) => {
                        warnings.push(format!("Data quality checks failed: {}", e));
                        None
                    }
                }
            }
            #[cfg(not(feature = "eval"))]
            {
                None
            }
        } else {
            None
        };
        EvalReport {
            model_name: self.model_name,
            timestamp,
            core,
            per_type,
            errors,
            bias,
            data_quality,
            calibration,
            recommendations,
            warnings,
        }
    }
}
// =============================================================================
// Display Implementation
// =============================================================================
impl EvalReport {
    /// Render the report as a human-readable, multi-section text summary.
    ///
    /// Sections (per-type breakdown, errors, recommendations, warnings) are
    /// emitted only when they contain data.
    pub fn summary(&self) -> String {
        use std::fmt::Write as _;
        let mut buf = String::new();
        // Writing into a String is infallible, so write! results are ignored.
        let _ = writeln!(buf, "=== Evaluation Report: {} ===", self.model_name);
        let _ = writeln!(buf, "Generated: {}\n", self.timestamp);
        // Core metrics
        buf.push_str("## Core Metrics\n");
        let _ = writeln!(buf, " Precision: {:.1}%", self.core.precision * 100.0);
        let _ = writeln!(buf, " Recall: {:.1}%", self.core.recall * 100.0);
        let _ = writeln!(buf, " F1: {:.1}%", self.core.f1 * 100.0);
        let _ = writeln!(
            buf,
            " ({} correct / {} predicted / {} gold)\n",
            self.core.total_correct, self.core.total_predicted, self.core.total_gold
        );
        // Per-type breakdown, highest-support types first.
        if !self.per_type.is_empty() {
            buf.push_str("## Per-Type Breakdown\n");
            let mut rows: Vec<_> = self.per_type.iter().collect();
            rows.sort_by(|a, b| b.1.support.cmp(&a.1.support));
            for (type_name, metrics) in rows {
                let _ = writeln!(
                    buf,
                    " {:12} P={:.0}% R={:.0}% F1={:.0}% (n={})",
                    type_name,
                    metrics.precision * 100.0,
                    metrics.recall * 100.0,
                    metrics.f1 * 100.0,
                    metrics.support
                );
            }
            buf.push('\n');
        }
        // Error summary
        if let Some(ref errors) = self.errors {
            buf.push_str("## Error Analysis\n");
            let _ = writeln!(buf, " Total errors: {}", errors.total_errors);
            let _ = writeln!(buf, " False positives: {}", errors.false_positives);
            let _ = writeln!(buf, " False negatives: {}", errors.false_negatives);
            if !errors.top_patterns.is_empty() {
                buf.push_str(" Sample errors:\n");
                for pattern in &errors.top_patterns {
                    let _ = writeln!(buf, " - {}", pattern);
                }
            }
            buf.push('\n');
        }
        // Recommendations
        if !self.recommendations.is_empty() {
            buf.push_str("## Recommendations\n");
            for rec in &self.recommendations {
                let tag = match rec.priority {
                    Priority::High => "[HIGH]",
                    Priority::Medium => "[MED]",
                    Priority::Low => "[LOW]",
                };
                let _ = writeln!(buf, " {} {}", tag, rec.message);
            }
            buf.push('\n');
        }
        // Warnings
        if !self.warnings.is_empty() {
            buf.push_str("## Warnings\n");
            for warning in &self.warnings {
                let _ = writeln!(buf, " - {}", warning);
            }
        }
        buf
    }
    /// Export report as pretty-printed JSON.
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` when serde serialization fails.
    pub fn to_json(&self) -> Result<String> {
        serde_json::to_string_pretty(self)
            .map_err(|e| crate::Error::InvalidInput(format!("JSON serialization failed: {}", e)))
    }
}
impl fmt::Display for EvalReport {
    /// Delegates to [`EvalReport::summary`].
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.summary())
    }
}
// =============================================================================
// Helpers
// =============================================================================
/// Produce a lightweight timestamp string without a chrono dependency.
///
/// Falls back to 0 seconds if the system clock is before the Unix epoch.
fn chrono_lite_timestamp() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    let secs = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|d| d.as_secs())
        .unwrap_or(0);
    format!("{}s since epoch", secs)
}
fn default_synthetic_cases() -> Vec<TestCase> {
// Minimal synthetic test set for quick evaluation
vec![
TestCase {
text: "Meeting on January 15, 2024 at 3:00 PM".into(),
gold_entities: vec![
SimpleGoldEntity {
text: "January 15, 2024".into(),
entity_type: "DATE".into(),
start: 11,
end: 27,
},
SimpleGoldEntity {
text: "3:00 PM".into(),
entity_type: "TIME".into(),
start: 31,
end: 38,
},
],
},
TestCase {
text: "Contact: user@example.com or call 555-1234".into(),
gold_entities: vec![
SimpleGoldEntity {
text: "user@example.com".into(),
entity_type: "EMAIL".into(),
start: 9,
end: 25,
},
SimpleGoldEntity {
text: "555-1234".into(),
entity_type: "PHONE".into(),
start: 34,
end: 42,
},
],
},
TestCase {
text: "Invoice total: $1,234.56 USD".into(),
gold_entities: vec![SimpleGoldEntity {
text: "$1,234.56".into(),
entity_type: "MONEY".into(),
start: 15,
end: 24,
}],
},
]
}
// =============================================================================
// Tests
// =============================================================================
#[cfg(test)]
mod tests {
    use super::*;
    // Smoke test: building with the default synthetic data records the model
    // name and a non-empty gold set.
    #[test]
    fn test_report_builder_basic() {
        use crate::RegexNER;
        let model = RegexNER::new();
        let report = ReportBuilder::new("RegexNER")
            .with_error_analysis(true)
            .build(&model);
        assert_eq!(report.model_name, "RegexNER");
        assert!(report.core.total_gold > 0);
    }
    // The text summary should surface the model name and formatted percentages.
    #[test]
    fn test_report_summary_format() {
        let report = EvalReport {
            model_name: "TestModel".into(),
            timestamp: "2024-01-01".into(),
            core: CoreMetrics {
                precision: 0.85,
                recall: 0.75,
                f1: 0.80,
                total_gold: 100,
                total_predicted: 90,
                total_correct: 75,
                macro_f1: None,
            },
            per_type: HashMap::new(),
            errors: None,
            bias: None,
            data_quality: None,
            calibration: None,
            recommendations: vec![],
            warnings: vec![],
        };
        let summary = report.summary();
        assert!(summary.contains("TestModel"));
        assert!(summary.contains("85.0%")); // precision
        assert!(summary.contains("75.0%")); // recall
    }
    // JSON export should pretty-print serde field names and values.
    #[test]
    fn test_report_json_export() {
        let report = EvalReport {
            model_name: "TestModel".into(),
            timestamp: "test".into(),
            core: CoreMetrics {
                precision: 0.9,
                recall: 0.8,
                f1: 0.85,
                total_gold: 10,
                total_predicted: 10,
                total_correct: 8,
                macro_f1: None,
            },
            per_type: HashMap::new(),
            errors: None,
            bias: None,
            data_quality: None,
            calibration: None,
            recommendations: vec![],
            warnings: vec![],
        };
        let json = report.to_json().unwrap();
        assert!(json.contains("\"model_name\": \"TestModel\""));
        assert!(json.contains("\"f1\": 0.85"));
    }
    // A model that misses PER/ORG entities should get a low F1 and/or
    // generated recommendations.
    #[test]
    fn test_recommendations_generated() {
        use crate::RegexNER;
        let model = RegexNER::new();
        // Create test data that will result in low F1
        let test_data = vec![TestCase {
            text: "John Smith works at Google".into(),
            gold_entities: vec![
                SimpleGoldEntity {
                    text: "John Smith".into(),
                    entity_type: "PER".into(),
                    start: 0,
                    end: 10,
                },
                SimpleGoldEntity {
                    text: "Google".into(),
                    entity_type: "ORG".into(),
                    start: 20,
                    end: 26,
                },
            ],
        }];
        let report = ReportBuilder::new("RegexNER")
            .with_test_data(test_data)
            .build(&model);
        // RegexNER can't detect PER/ORG, so F1 should be low
        // and recommendations should be generated
        assert!(
            report.core.f1 < 0.5
                || !report.recommendations.is_empty()
                || report.core.total_gold == 0
        );
    }
}