use crate::eval::datasets::GoldEntity;
use crate::eval::loader::{DatasetId, DatasetLoader};
use crate::eval::synthetic::{
all_datasets, datasets_by_difficulty, datasets_by_domain, AnnotatedExample, Difficulty, Domain,
};
use crate::eval::types::MetricWithVariance;
use crate::eval::{evaluate_ner_model, TypeMetrics};
use anno::{Error, Model, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Instant;
/// Tunable parameters for an evaluation run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalConfig {
    /// Maximum examples taken per dataset; 0 means no limit.
    pub max_examples_per_dataset: usize,
    /// Compute per-difficulty result tables (synthetic runs only).
    pub breakdown_by_difficulty: bool,
    /// Compute per-domain result tables (synthetic runs only).
    pub breakdown_by_domain: bool,
    /// Keep per-entity-type metrics.
    /// NOTE(review): not read by the run paths in this file — presumably
    /// consumed by the evaluator; confirm.
    pub breakdown_by_type: bool,
    /// Run the model on the first test case before timing starts.
    pub warmup: bool,
    /// Number of warmup passes performed when `warmup` is true.
    pub warmup_iterations: usize,
    /// NOTE(review): not read anywhere in this file — verify it is used
    /// downstream before relying on it.
    pub min_confidence: Option<f64>,
    /// Cache directory for the dataset loader; `None` uses the default
    /// loader location.
    pub cache_dir: Option<String>,
    /// NOTE(review): not read by this file's evaluation paths — presumably
    /// consumed by the evaluator; confirm.
    pub normalize_types: bool,
}
impl Default for EvalConfig {
    /// Defaults suit a thorough local run: no example cap, every breakdown
    /// enabled, a single warmup pass, and no type normalization.
    fn default() -> Self {
        Self {
            max_examples_per_dataset: 0,
            breakdown_by_difficulty: true,
            breakdown_by_domain: true,
            breakdown_by_type: true,
            warmup: true,
            warmup_iterations: 1,
            min_confidence: None,
            cache_dir: None,
            normalize_types: false,
        }
    }
}
impl EvalConfig {
    /// Fast configuration: capped at 100 examples, breakdowns and warmup
    /// disabled, per-type metrics kept.
    pub fn quick() -> Self {
        Self {
            max_examples_per_dataset: 100,
            breakdown_by_difficulty: false,
            breakdown_by_domain: false,
            warmup: false,
            warmup_iterations: 0,
            ..Self::default()
        }
    }

    /// Exhaustive configuration: unlimited examples, all breakdowns, two
    /// warmup passes, and type normalization enabled.
    pub fn full() -> Self {
        Self {
            warmup_iterations: 2,
            normalize_types: true,
            ..Self::default()
        }
    }

    /// Configuration adapted to the environment: lighter on CI (detected
    /// via the `CI` / `GITHUB_ACTIONS` env vars), with the example cap
    /// overridable through `ANNO_MAX_EXAMPLES`.
    pub fn ci_aware() -> Self {
        let in_ci = std::env::var("CI").is_ok() || std::env::var("GITHUB_ACTIONS").is_ok();
        let fallback = if in_ci { 50 } else { 100 };
        let max_examples = std::env::var("ANNO_MAX_EXAMPLES")
            .ok()
            .and_then(|raw| raw.parse().ok())
            .unwrap_or(fallback);
        Self {
            max_examples_per_dataset: max_examples,
            breakdown_by_difficulty: !in_ci,
            breakdown_by_domain: !in_ci,
            warmup: !in_ci,
            // One warmup pass locally, none on CI.
            warmup_iterations: usize::from(!in_ci),
            ..Self::default()
        }
    }

    /// Consumes `self`, returning a copy with type normalization enabled.
    #[must_use]
    pub fn with_type_normalization(mut self) -> Self {
        self.normalize_types = true;
        self
    }
}
/// Ordered collection of named NER backends to evaluate.
pub struct BackendRegistry {
    // (name, description, model) triples, kept in registration order.
    backends: Vec<(String, String, Box<dyn Model>)>,
}
impl BackendRegistry {
#[must_use]
pub fn new() -> Self {
Self {
backends: Vec::new(),
}
}
pub fn register(
&mut self,
name: impl Into<String>,
description: impl Into<String>,
model: Box<dyn Model>,
) {
self.backends.push((name.into(), description.into(), model));
}
pub fn len(&self) -> usize {
self.backends.len()
}
pub fn is_empty(&self) -> bool {
self.backends.is_empty()
}
pub fn iter(&self) -> impl Iterator<Item = (&str, &str, &dyn Model)> {
self.backends
.iter()
.map(|(name, desc, model)| (name.as_str(), desc.as_str(), model.as_ref()))
}
pub fn register_defaults(&mut self) {
use anno::{HeuristicNER, RegexNER, StackedNER};
self.register(
"RegexNER",
"Regex patterns (DATE/MONEY/EMAIL/etc.)",
Box::new(RegexNER::new()),
);
self.register(
"HeuristicNER",
"Heuristics (PER/ORG/LOC)",
Box::new(HeuristicNER::new()),
);
self.register(
"StackedNER",
"Pattern + Statistical combined",
Box::new(StackedNER::new()),
);
}
#[cfg(feature = "onnx")]
pub fn register_onnx(&mut self) {
use crate::{BertNEROnnx, GLiNEROnnx, DEFAULT_BERT_ONNX_MODEL, DEFAULT_GLINER_MODEL};
match GLiNEROnnx::new(DEFAULT_GLINER_MODEL) {
Ok(gliner) => {
self.register("GLiNER", "Zero-shot NER via ONNX", Box::new(gliner));
}
Err(e) => {
log::warn!("Failed to load GLiNER ONNX: {}", e);
}
}
match BertNEROnnx::new(DEFAULT_BERT_ONNX_MODEL) {
Ok(bert) => {
self.register("BertNEROnnx", "BERT NER via ONNX", Box::new(bert));
}
Err(e) => {
log::warn!("Failed to load BERT ONNX: {}", e);
}
}
}
#[cfg(feature = "candle")]
pub fn register_candle(&mut self) {
use crate::{CandleNER, DEFAULT_CANDLE_MODEL};
match CandleNER::from_pretrained(DEFAULT_CANDLE_MODEL) {
Ok(candle) => {
self.register(
"CandleNER",
"Pure Rust BERT NER via Candle",
Box::new(candle),
);
}
Err(e) => {
log::warn!("Failed to load Candle NER: {}", e);
}
}
}
#[cfg(any(feature = "onnx", feature = "candle"))]
pub fn register_gliner_multitask(&mut self, model_id: &str) {
use crate::backends::gliner_multitask::GLiNERMultitask;
match GLiNERMultitask::from_pretrained(model_id) {
Ok(model) => {
self.register(
"GLiNERMultitask",
"Multi-task zero-shot NER, classification, structure",
Box::new(model),
);
}
Err(e) => {
log::warn!("Failed to load GLiNER multi-task from {}: {}", model_id, e);
}
}
}
#[cfg(any(feature = "onnx", feature = "candle"))]
pub fn register_gliner_multitask_default(&mut self) {
self.register_gliner_multitask(anno::DEFAULT_GLINER_MULTITASK_MODEL);
}
pub fn register_stack(
&mut self,
name: impl Into<String>,
layer_names: &[&str],
strategy: anno::backends::stacked::ConflictStrategy,
) {
use anno::backends::stacked::StackedNERBuilder;
use anno::{HeuristicNER, RegexNER};
let name = name.into();
let mut builder = StackedNERBuilder::default().strategy(strategy);
for layer_name in layer_names {
match *layer_name {
"RegexNER" | "pattern" => {
builder = builder.layer(RegexNER::new());
}
"HeuristicNER" | "heuristic" => {
builder = builder.layer(HeuristicNER::new());
}
_ => {
eprintln!(
"Warning: Unknown layer '{}' in stack '{}'",
layer_name, name
);
}
}
}
let description = format!("Stack: {} ({:?})", layer_names.join(" -> "), strategy);
self.register(name, description, Box::new(builder.build()));
}
pub fn register_all_combinations(&mut self) {
use anno::backends::stacked::ConflictStrategy;
self.register_defaults();
self.register_stack(
"Heuristic->Pattern",
&["HeuristicNER", "RegexNER"],
ConflictStrategy::HighestConf,
);
self.register_stack(
"Stack_LongestSpan",
&["RegexNER", "HeuristicNER"],
ConflictStrategy::LongestSpan,
);
self.register_stack(
"Stack_Priority",
&["RegexNER", "HeuristicNER"],
ConflictStrategy::Priority,
);
self.register_stack(
"Stack_Union",
&["RegexNER", "HeuristicNER"],
ConflictStrategy::Union,
);
}
}
impl Default for BackendRegistry {
    /// Builds a registry pre-populated with every backend the enabled
    /// feature set provides (rule-based always; ONNX/Candle when compiled in).
    fn default() -> Self {
        let mut reg = Self::new();
        reg.register_defaults();
        #[cfg(feature = "onnx")]
        reg.register_onnx();
        #[cfg(feature = "candle")]
        reg.register_candle();
        reg
    }
}
/// Metrics for one backend evaluated on one dataset (or one category label).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackendDatasetResult {
    /// Name the backend was registered under.
    pub backend_name: String,
    /// Dataset or breakdown-category label these metrics belong to.
    pub dataset_name: String,
    /// Number of test cases evaluated.
    pub num_examples: usize,
    /// Total gold entities across all test cases.
    pub num_gold_entities: usize,
    pub precision: f64,
    pub recall: f64,
    pub f1: f64,
    /// Macro-averaged F1, when the evaluator computed one.
    pub macro_f1: Option<f64>,
    /// Entities the model produced.
    pub found: usize,
    /// Entities expected per the gold annotations.
    pub expected: usize,
    /// Per-entity-type metrics keyed by type name.
    pub per_type: HashMap<String, TypeMetrics>,
    /// Wall-clock time of the evaluation run, in milliseconds.
    pub duration_ms: f64,
    /// Throughput as reported by the evaluator.
    pub tokens_per_second: f64,
}
/// A backend's results aggregated across all datasets it was run on.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BackendAggregateResult {
    /// Name the backend was registered under.
    pub backend_name: String,
    /// Human-readable description supplied at registration.
    pub description: String,
    /// F1 over the per-dataset samples (mean plus variance info).
    pub f1: MetricWithVariance,
    /// Precision over the per-dataset samples.
    pub precision: MetricWithVariance,
    /// Recall over the per-dataset samples.
    pub recall: MetricWithVariance,
    /// Sum of examples over all datasets.
    pub total_examples: usize,
    /// Sum of entities found over all datasets.
    pub total_found: usize,
    /// Sum of gold entities expected over all datasets.
    pub total_expected: usize,
    /// Sum of evaluation wall-clock time, in milliseconds.
    pub total_duration_ms: f64,
    /// The individual per-dataset results that were aggregated.
    pub per_dataset: Vec<BackendDatasetResult>,
}
/// Complete output of an evaluation run; serializable and renderable as HTML.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResults {
    /// RFC 3339 UTC timestamp of when the results were assembled.
    pub timestamp: String,
    /// The configuration the run used.
    pub config: EvalConfig,
    /// One aggregate entry per registered backend.
    pub backends: Vec<BackendAggregateResult>,
    /// Per-difficulty results, when the config enabled that breakdown.
    pub by_difficulty: Option<HashMap<String, Vec<BackendDatasetResult>>>,
    /// Per-domain results, when the config enabled that breakdown.
    pub by_domain: Option<HashMap<String, Vec<BackendDatasetResult>>>,
    /// Summary statistics over the evaluated dataset pool.
    pub dataset_stats: DatasetStatsSummary,
}
/// Aggregate statistics over a pool of annotated examples.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DatasetStatsSummary {
    /// Number of examples in the pool.
    pub total_examples: usize,
    /// Number of gold entities across all examples.
    pub total_entities: usize,
    /// Gold-entity count per `Debug`-formatted entity type.
    pub entity_type_distribution: HashMap<String, usize>,
    /// Example count per `Debug`-formatted domain.
    pub domain_distribution: HashMap<String, usize>,
    /// Example count per `Debug`-formatted difficulty.
    pub difficulty_distribution: HashMap<String, usize>,
}
/// Orchestrates evaluation of registered backends over datasets.
pub struct EvalHarness {
    // Run parameters.
    config: EvalConfig,
    // Backends to evaluate.
    registry: BackendRegistry,
    // Dataset loader; None when default construction failed and no explicit
    // cache dir was configured.
    loader: Option<DatasetLoader>,
}
impl EvalHarness {
pub fn new(config: EvalConfig) -> Result<Self> {
let loader = if let Some(ref dir) = config.cache_dir {
Some(DatasetLoader::with_cache_dir(dir)?)
} else {
DatasetLoader::new().ok()
};
Ok(Self {
config,
registry: BackendRegistry::new(),
loader,
})
}
pub fn with_defaults() -> Result<Self> {
let mut harness = Self::new(EvalConfig::default())?;
harness.registry = BackendRegistry::default();
Ok(harness)
}
pub fn with_config(config: EvalConfig) -> Result<Self> {
let mut harness = Self::new(config)?;
harness.registry = BackendRegistry::default();
Ok(harness)
}
pub fn register(
&mut self,
name: impl Into<String>,
description: impl Into<String>,
model: Box<dyn Model>,
) {
self.registry.register(name, description, model);
}
pub fn register_defaults(&mut self) {
self.registry.register_defaults();
}
pub fn registry(&self) -> &BackendRegistry {
&self.registry
}
pub fn registry_mut(&mut self) -> &mut BackendRegistry {
&mut self.registry
}
pub fn backend_count(&self) -> usize {
self.registry.len()
}
pub fn run_synthetic(&self) -> Result<EvalResults> {
if self.registry.is_empty() {
return Err(Error::InvalidInput(
"No backends registered for evaluation".to_string(),
));
}
let all_examples = all_datasets();
let test_cases: Vec<_> = all_examples
.iter()
.filter(|ex| !ex.text.is_empty())
.take(if self.config.max_examples_per_dataset > 0 {
self.config.max_examples_per_dataset
} else {
usize::MAX
})
.map(|ex| (ex.text.clone(), ex.entities.clone()))
.collect();
let dataset_stats = compute_dataset_stats(&all_examples);
let mut backends_results = Vec::new();
for (name, desc, model) in self.registry.iter() {
let result = self.evaluate_model_on_cases(model, name, "synthetic", &test_cases)?;
backends_results.push((name.to_string(), desc.to_string(), vec![result]));
}
let backends = backends_results
.into_iter()
.map(|(name, desc, results)| aggregate_backend_results(&name, &desc, results))
.collect();
let by_difficulty = if self.config.breakdown_by_difficulty {
Some(self.compute_difficulty_breakdown()?)
} else {
None
};
let by_domain = if self.config.breakdown_by_domain {
Some(self.compute_domain_breakdown()?)
} else {
None
};
Ok(EvalResults {
timestamp: chrono::Utc::now().to_rfc3339(),
config: self.config.clone(),
backends,
by_difficulty,
by_domain,
dataset_stats,
})
}
#[cfg(feature = "eval")]
pub fn run_real_datasets(&self, datasets: &[DatasetId]) -> Result<EvalResults> {
if self.registry.is_empty() {
return Err(Error::InvalidInput(
"No backends registered for evaluation".to_string(),
));
}
let loader = self
.loader
.as_ref()
.ok_or_else(|| Error::InvalidInput("Dataset loader not initialized".to_string()))?;
let mut all_test_cases: Vec<(String, Vec<GoldEntity>)> = Vec::new();
let mut dataset_results: HashMap<String, Vec<(String, Vec<GoldEntity>)>> = HashMap::new();
for dataset_id in datasets {
let loadable = match crate::eval::LoadableDatasetId::try_from(*dataset_id) {
Ok(id) => id,
Err(e) => {
log::warn!("Skipping {} (not loadable): {}", dataset_id.name(), e);
continue;
}
};
match loader.load_or_download(loadable) {
Ok(loaded) => {
let cases = loaded.to_test_cases();
let limited: Vec<_> = if self.config.max_examples_per_dataset > 0 {
cases
.into_iter()
.take(self.config.max_examples_per_dataset)
.collect()
} else {
cases
};
dataset_results.insert(dataset_id.name().to_string(), limited.clone());
all_test_cases.extend(limited);
}
Err(e) => {
log::warn!("Failed to load {}: {}", dataset_id.name(), e);
}
}
}
if all_test_cases.is_empty() {
return Err(Error::InvalidInput("No datasets loaded".to_string()));
}
let mut backends_results = Vec::new();
for (name, desc, model) in self.registry.iter() {
let mut per_dataset_results = Vec::new();
for (dataset_name, cases) in &dataset_results {
let result = self.evaluate_model_on_cases(model, name, dataset_name, cases)?;
per_dataset_results.push(result);
}
backends_results.push((name.to_string(), desc.to_string(), per_dataset_results));
}
let backends = backends_results
.into_iter()
.map(|(name, desc, results)| aggregate_backend_results(&name, &desc, results))
.collect();
Ok(EvalResults {
timestamp: chrono::Utc::now().to_rfc3339(),
config: self.config.clone(),
backends,
by_difficulty: None,
by_domain: None,
dataset_stats: DatasetStatsSummary::default(),
})
}
pub fn run_cached_datasets(&self, datasets: &[DatasetId]) -> Result<EvalResults> {
if self.registry.is_empty() {
return Err(Error::InvalidInput(
"No backends registered for evaluation".to_string(),
));
}
let loader = self
.loader
.as_ref()
.ok_or_else(|| Error::InvalidInput("Dataset loader not initialized".to_string()))?;
let mut all_test_cases: Vec<(String, Vec<GoldEntity>)> = Vec::new();
let mut dataset_results: HashMap<String, Vec<(String, Vec<GoldEntity>)>> = HashMap::new();
for dataset_id in datasets {
let loadable = match crate::eval::LoadableDatasetId::try_from(*dataset_id) {
Ok(id) => id,
Err(_) => continue,
};
if loader.is_cached(loadable) {
match loader.load(loadable) {
Ok(loaded) => {
let cases = loaded.to_test_cases();
let limited: Vec<_> = if self.config.max_examples_per_dataset > 0 {
cases
.into_iter()
.take(self.config.max_examples_per_dataset)
.collect()
} else {
cases
};
dataset_results.insert(dataset_id.name().to_string(), limited.clone());
all_test_cases.extend(limited);
}
Err(e) => {
log::warn!("Failed to load cached {}: {}", dataset_id.name(), e);
}
}
}
}
if all_test_cases.is_empty() {
return Err(Error::InvalidInput(
"No cached datasets available".to_string(),
));
}
let mut backends_results = Vec::new();
for (name, desc, model) in self.registry.iter() {
let mut per_dataset_results = Vec::new();
for (dataset_name, cases) in &dataset_results {
let result = self.evaluate_model_on_cases(model, name, dataset_name, cases)?;
per_dataset_results.push(result);
}
backends_results.push((name.to_string(), desc.to_string(), per_dataset_results));
}
let backends = backends_results
.into_iter()
.map(|(name, desc, results)| aggregate_backend_results(&name, &desc, results))
.collect();
Ok(EvalResults {
timestamp: chrono::Utc::now().to_rfc3339(),
config: self.config.clone(),
backends,
by_difficulty: None,
by_domain: None,
dataset_stats: DatasetStatsSummary::default(),
})
}
fn evaluate_model_on_cases(
&self,
model: &dyn Model,
backend_name: &str,
dataset_name: &str,
test_cases: &[(String, Vec<GoldEntity>)],
) -> Result<BackendDatasetResult> {
if self.config.warmup && !test_cases.is_empty() {
for _ in 0..self.config.warmup_iterations {
let _ = model.extract_entities(&test_cases[0].0, None);
}
}
let start = Instant::now();
let results = evaluate_ner_model(model, test_cases)?;
let duration = start.elapsed();
let total_gold: usize = test_cases.iter().map(|(_, gold)| gold.len()).sum();
Ok(BackendDatasetResult {
backend_name: backend_name.to_string(),
dataset_name: dataset_name.to_string(),
num_examples: test_cases.len(),
num_gold_entities: total_gold,
precision: results.precision,
recall: results.recall,
f1: results.f1,
macro_f1: results.macro_f1,
found: results.found,
expected: results.expected,
per_type: results.per_type,
duration_ms: duration.as_secs_f64() * 1000.0,
tokens_per_second: results.tokens_per_second,
})
}
fn compute_difficulty_breakdown(&self) -> Result<HashMap<String, Vec<BackendDatasetResult>>> {
let difficulties = [
Difficulty::Easy,
Difficulty::Medium,
Difficulty::Hard,
Difficulty::Adversarial,
];
let mut breakdown = HashMap::new();
for difficulty in difficulties {
let subset: Vec<_> = datasets_by_difficulty(difficulty)
.into_iter()
.filter(|ex| !ex.text.is_empty())
.map(|ex| (ex.text, ex.entities))
.collect();
if subset.is_empty() {
continue;
}
let difficulty_name = format!("{:?}", difficulty);
let mut difficulty_results = Vec::new();
for (name, _desc, model) in self.registry.iter() {
let result =
self.evaluate_model_on_cases(model, name, &difficulty_name, &subset)?;
difficulty_results.push(result);
}
breakdown.insert(difficulty_name, difficulty_results);
}
Ok(breakdown)
}
fn compute_domain_breakdown(&self) -> Result<HashMap<String, Vec<BackendDatasetResult>>> {
let domains = [
Domain::News,
Domain::Financial,
Domain::Technical,
Domain::Sports,
Domain::Entertainment,
Domain::Politics,
Domain::Ecommerce,
Domain::Travel,
Domain::Weather,
Domain::Academic,
Domain::Historical,
Domain::Food,
Domain::RealEstate,
Domain::Conversational,
Domain::SocialMedia,
Domain::Biomedical,
Domain::Legal,
Domain::Scientific,
];
let mut breakdown = HashMap::new();
for domain in domains {
let subset: Vec<_> = datasets_by_domain(domain)
.into_iter()
.filter(|ex| !ex.text.is_empty())
.map(|ex| (ex.text, ex.entities))
.collect();
if subset.is_empty() {
continue;
}
let domain_name = format!("{:?}", domain);
let mut domain_results = Vec::new();
for (name, _desc, model) in self.registry.iter() {
let result = self.evaluate_model_on_cases(model, name, &domain_name, &subset)?;
domain_results.push(result);
}
breakdown.insert(domain_name, domain_results);
}
Ok(breakdown)
}
}
impl EvalResults {
    /// Renders the results as a self-contained HTML report (styling comes
    /// from the embedded [`HTML_HEAD`]).
    pub fn to_html(&self) -> String {
        let mut html = String::new();
        html.push_str(HTML_HEAD);
        html.push_str("<body>\n");
        html.push_str("<div class=\"container\">\n");
        html.push_str("<h1>NER Evaluation Report</h1>\n");
        html.push_str(&format!(
            "<p class=\"timestamp\">Generated: {}</p>\n",
            self.timestamp
        ));
        // Dataset summary stat boxes.
        html.push_str("<h2>Dataset Summary</h2>\n");
        html.push_str("<div class=\"stats-grid\">\n");
        html.push_str(&format!(
            "<div class=\"stat-box\"><span class=\"stat-value\">{}</span><span class=\"stat-label\">Examples</span></div>\n",
            self.dataset_stats.total_examples
        ));
        html.push_str(&format!(
            "<div class=\"stat-box\"><span class=\"stat-value\">{}</span><span class=\"stat-label\">Entities</span></div>\n",
            self.dataset_stats.total_entities
        ));
        html.push_str(&format!(
            "<div class=\"stat-box\"><span class=\"stat-value\">{}</span><span class=\"stat-label\">Backends</span></div>\n",
            self.backends.len()
        ));
        html.push_str("</div>\n");
        // Overall per-backend table.
        html.push_str("<h2>Overall Results</h2>\n");
        html.push_str("<table>\n");
        html.push_str("<thead><tr><th>Backend</th><th>F1</th><th>Precision</th><th>Recall</th><th>Found/Expected</th><th>Time</th></tr></thead>\n");
        html.push_str("<tbody>\n");
        for backend in &self.backends {
            // Color-code mean F1: >80% good, >50% ok, otherwise poor.
            let f1_class = if backend.f1.mean > 0.8 {
                "good"
            } else if backend.f1.mean > 0.5 {
                "ok"
            } else {
                "poor"
            };
            html.push_str(&format!(
                "<tr><td><strong>{}</strong><br><small>{}</small></td>\
                 <td class=\"{}\"><strong>{:.1}%</strong><br><small>{}</small></td>\
                 <td>{:.1}%</td>\
                 <td>{:.1}%</td>\
                 <td>{} / {}</td>\
                 <td>{:.1}ms</td></tr>\n",
                backend.backend_name,
                backend.description,
                f1_class,
                backend.f1.mean * 100.0,
                backend.f1.format_with_ci(),
                backend.precision.mean * 100.0,
                backend.recall.mean * 100.0,
                backend.total_found,
                backend.total_expected,
                backend.total_duration_ms,
            ));
        }
        html.push_str("</tbody></table>\n");
        // Optional breakdown tables.
        if let Some(ref by_diff) = self.by_difficulty {
            html.push_str("<h2>Results by Difficulty</h2>\n");
            html.push_str(&self.render_breakdown_table(by_diff));
        }
        if let Some(ref by_dom) = self.by_domain {
            html.push_str("<h2>Results by Domain</h2>\n");
            html.push_str(&self.render_breakdown_table(by_dom));
        }
        // Per-type metrics table for the backend with the highest mean F1,
        // averaging each type's metrics across that backend's datasets.
        if let Some(best) = self.backends.iter().max_by(|a, b| {
            a.f1.mean
                .partial_cmp(&b.f1.mean)
                .unwrap_or(std::cmp::Ordering::Equal)
        }) {
            if !best.per_dataset.is_empty() {
                html.push_str(&format!(
                    "<h2>Per-Type Metrics ({})</h2>\n",
                    best.backend_name
                ));
                // Collect each type's metrics from every dataset result.
                let mut type_metrics: HashMap<String, Vec<&TypeMetrics>> = HashMap::new();
                for ds_result in &best.per_dataset {
                    for (type_name, metrics) in &ds_result.per_type {
                        type_metrics
                            .entry(type_name.clone())
                            .or_default()
                            .push(metrics);
                    }
                }
                html.push_str("<table>\n");
                html.push_str("<thead><tr><th>Type</th><th>F1</th><th>Precision</th><th>Recall</th><th>Correct/Expected</th></tr></thead>\n");
                html.push_str("<tbody>\n");
                // Sort types by average F1, best first.
                let mut sorted_types: Vec<_> = type_metrics.iter().collect();
                sorted_types.sort_by(|a, b| {
                    let avg_f1_a = a.1.iter().map(|m| m.f1).sum::<f64>() / a.1.len() as f64;
                    let avg_f1_b = b.1.iter().map(|m| m.f1).sum::<f64>() / b.1.len() as f64;
                    avg_f1_b
                        .partial_cmp(&avg_f1_a)
                        .unwrap_or(std::cmp::Ordering::Equal)
                });
                for (type_name, metrics_list) in sorted_types {
                    // Unweighted averages across the datasets that had this type.
                    let avg_f1 =
                        metrics_list.iter().map(|m| m.f1).sum::<f64>() / metrics_list.len() as f64;
                    let avg_p = metrics_list.iter().map(|m| m.precision).sum::<f64>()
                        / metrics_list.len() as f64;
                    let avg_r = metrics_list.iter().map(|m| m.recall).sum::<f64>()
                        / metrics_list.len() as f64;
                    let total_correct: usize = metrics_list.iter().map(|m| m.correct).sum();
                    let total_expected: usize = metrics_list.iter().map(|m| m.expected).sum();
                    let f1_class = if avg_f1 > 0.8 {
                        "good"
                    } else if avg_f1 > 0.5 {
                        "ok"
                    } else {
                        "poor"
                    };
                    html.push_str(&format!(
                        "<tr><td>{}</td><td class=\"{}\">{:.1}%</td><td>{:.1}%</td><td>{:.1}%</td><td>{}/{}</td></tr>\n",
                        type_name,
                        f1_class,
                        avg_f1 * 100.0,
                        avg_p * 100.0,
                        avg_r * 100.0,
                        total_correct,
                        total_expected,
                    ));
                }
                html.push_str("</tbody></table>\n");
            }
        }
        // Entity-type distribution of the gold data, most common first.
        if !self.dataset_stats.entity_type_distribution.is_empty() {
            html.push_str("<h2>Entity Type Distribution</h2>\n");
            html.push_str("<table>\n");
            html.push_str("<thead><tr><th>Type</th><th>Count</th><th>Percent</th></tr></thead>\n");
            html.push_str("<tbody>\n");
            let total: usize = self.dataset_stats.entity_type_distribution.values().sum();
            let mut sorted: Vec<_> = self.dataset_stats.entity_type_distribution.iter().collect();
            sorted.sort_by(|a, b| b.1.cmp(a.1));
            for (type_name, count) in sorted {
                let pct = (*count as f64 / total as f64) * 100.0;
                html.push_str(&format!(
                    "<tr><td>{}</td><td>{}</td><td>{:.1}%</td></tr>\n",
                    type_name, count, pct
                ));
            }
            html.push_str("</tbody></table>\n");
        }
        html.push_str("</div>\n</body></html>\n");
        html
    }

    /// Renders one breakdown map (category -> per-backend results) as a
    /// table with one row per category and one F1 column per backend.
    /// Backend column order is taken from the first map entry.
    fn render_breakdown_table(
        &self,
        breakdown: &HashMap<String, Vec<BackendDatasetResult>>,
    ) -> String {
        let mut html = String::new();
        html.push_str("<table>\n");
        let backend_names: Vec<_> = breakdown
            .values()
            .next()
            .map(|results| results.iter().map(|r| r.backend_name.as_str()).collect())
            .unwrap_or_default();
        html.push_str("<thead><tr><th>Category</th>");
        for name in &backend_names {
            html.push_str(&format!("<th>{}</th>", name));
        }
        html.push_str("</tr></thead>\n");
        html.push_str("<tbody>\n");
        // Deterministic row order: categories sorted alphabetically.
        let mut sorted_keys: Vec<_> = breakdown.keys().collect();
        sorted_keys.sort();
        for key in sorted_keys {
            let results = &breakdown[key];
            html.push_str(&format!("<tr><td>{}</td>", key));
            for backend_name in &backend_names {
                if let Some(result) = results.iter().find(|r| r.backend_name == *backend_name) {
                    let f1_class = if result.f1 > 0.8 {
                        "good"
                    } else if result.f1 > 0.5 {
                        "ok"
                    } else {
                        "poor"
                    };
                    html.push_str(&format!(
                        "<td class=\"{}\">{:.1}%</td>",
                        f1_class,
                        result.f1 * 100.0
                    ));
                } else {
                    // Backend missing for this category.
                    html.push_str("<td>-</td>");
                }
            }
            html.push_str("</tr>\n");
        }
        html.push_str("</tbody></table>\n");
        html
    }
}
fn compute_dataset_stats(examples: &[AnnotatedExample]) -> DatasetStatsSummary {
let mut entity_type_dist: HashMap<String, usize> = HashMap::new();
let mut domain_dist: HashMap<String, usize> = HashMap::new();
let mut difficulty_dist: HashMap<String, usize> = HashMap::new();
let mut total_entities = 0;
for ex in examples {
*domain_dist.entry(format!("{:?}", ex.domain)).or_insert(0) += 1;
*difficulty_dist
.entry(format!("{:?}", ex.difficulty))
.or_insert(0) += 1;
for entity in &ex.entities {
let type_name = format!("{:?}", entity.entity_type);
*entity_type_dist.entry(type_name).or_insert(0) += 1;
total_entities += 1;
}
}
DatasetStatsSummary {
total_examples: examples.len(),
total_entities,
entity_type_distribution: entity_type_dist,
domain_distribution: domain_dist,
difficulty_distribution: difficulty_dist,
}
}
/// Folds a backend's per-dataset results into a single aggregate:
/// mean-with-variance for F1/precision/recall plus summed totals.
fn aggregate_backend_results(
    name: &str,
    desc: &str,
    results: Vec<BackendDatasetResult>,
) -> BackendAggregateResult {
    // No datasets: report zeroed metrics.
    if results.is_empty() {
        return BackendAggregateResult {
            backend_name: name.to_string(),
            description: desc.to_string(),
            f1: MetricWithVariance::default(),
            precision: MetricWithVariance::default(),
            recall: MetricWithVariance::default(),
            total_examples: 0,
            total_found: 0,
            total_expected: 0,
            total_duration_ms: 0.0,
            per_dataset: vec![],
        };
    }
    // One sample per dataset for each metric.
    let samples = |f: fn(&BackendDatasetResult) -> f64| -> Vec<f64> {
        results.iter().map(f).collect()
    };
    let f1 = MetricWithVariance::from_samples(&samples(|r| r.f1));
    let precision = MetricWithVariance::from_samples(&samples(|r| r.precision));
    let recall = MetricWithVariance::from_samples(&samples(|r| r.recall));
    let mut total_examples = 0;
    let mut total_found = 0;
    let mut total_expected = 0;
    let mut total_duration_ms = 0.0;
    for r in &results {
        total_examples += r.num_examples;
        total_found += r.found;
        total_expected += r.expected;
        total_duration_ms += r.duration_ms;
    }
    BackendAggregateResult {
        backend_name: name.to_string(),
        description: desc.to_string(),
        f1,
        precision,
        recall,
        total_examples,
        total_found,
        total_expected,
        total_duration_ms,
        per_dataset: results,
    }
}
/// Static HTML preamble (doctype through `</head>`) with embedded
/// dark-theme CSS, used by [`EvalResults::to_html`].
const HTML_HEAD: &str = r#"<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>NER Evaluation Report</title>
<style>
:root {
--bg: #0d1117;
--fg: #c9d1d9;
--accent: #58a6ff;
--good: #3fb950;
--ok: #d29922;
--poor: #f85149;
--border: #30363d;
--surface: #161b22;
}
* { box-sizing: border-box; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
background: var(--bg);
color: var(--fg);
margin: 0;
padding: 0;
line-height: 1.6;
}
.container { max-width: 1200px; margin: 0 auto; padding: 2rem; }
h1 { color: var(--accent); border-bottom: 2px solid var(--border); padding-bottom: 0.5rem; }
h2 { color: var(--fg); margin-top: 2rem; }
.timestamp { color: #8b949e; font-size: 0.9rem; }
.stats-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; margin: 1rem 0; }
.stat-box {
background: var(--surface);
border: 1px solid var(--border);
border-radius: 8px;
padding: 1rem;
text-align: center;
}
.stat-value { display: block; font-size: 2rem; font-weight: bold; color: var(--accent); }
.stat-label { display: block; font-size: 0.8rem; color: #8b949e; text-transform: uppercase; }
table {
width: 100%;
border-collapse: collapse;
background: var(--surface);
border-radius: 8px;
overflow: hidden;
margin: 1rem 0;
}
th, td { padding: 0.75rem 1rem; text-align: left; border-bottom: 1px solid var(--border); }
th { background: #21262d; color: var(--fg); font-weight: 600; }
tr:hover { background: #21262d; }
.good { color: var(--good); }
.ok { color: var(--ok); }
.poor { color: var(--poor); }
small { color: #8b949e; }
</style>
</head>
"#;
#[cfg(test)]
mod tests {
    use super::*;

    /// Config capped at 5 examples with all optional work disabled,
    /// shared by the fast harness tests below.
    fn tiny_config() -> EvalConfig {
        EvalConfig {
            max_examples_per_dataset: 5,
            ..EvalConfig::quick()
        }
    }

    #[test]
    fn test_eval_config_default() {
        let config = EvalConfig::default();
        assert!(config.breakdown_by_difficulty);
        assert!(config.breakdown_by_domain);
    }

    #[test]
    fn test_backend_registry() {
        let mut registry = BackendRegistry::new();
        assert!(registry.is_empty());
        registry.register_defaults();
        // Was previously `!registry.is_empty()` asserted twice; check the
        // count that the failure message actually reports.
        assert!(
            registry.len() >= 1,
            "Expected at least 1 backend, got {}",
            registry.len()
        );
    }

    #[test]
    fn test_synthetic_eval() {
        let mut harness = EvalHarness::new(tiny_config()).unwrap();
        harness.registry.register_defaults();
        let results = harness.run_synthetic();
        assert!(results.is_ok());
        let results = results.unwrap();
        assert!(!results.backends.is_empty());
    }

    #[test]
    fn test_html_generation() {
        let mut harness = EvalHarness::new(tiny_config()).unwrap();
        harness.registry.register_defaults();
        let results = harness.run_synthetic().unwrap();
        let html = results.to_html();
        assert!(html.contains("<html"));
        assert!(html.contains("NER Evaluation Report"));
        assert!(html.contains("RegexNER"));
    }
}