use anno::{Error, Model, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Settings for morpheme-level F1 scoring of boundary-segmented entity text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MorphemeConfig {
    /// Character that marks morpheme boundaries inside a token (default '-').
    pub boundary_char: char,
    /// Fall back to character-level scoring when no boundaries are present.
    /// NOTE(review): not consulted by the evaluation code visible in this
    /// file — confirm intended use.
    pub char_level_fallback: bool,
    /// Minimum length for a split segment to count as a morpheme.
    pub min_morpheme_len: usize,
}
impl Default for MorphemeConfig {
fn default() -> Self {
Self {
boundary_char: '-',
char_level_fallback: true,
min_morpheme_len: 1,
}
}
}
/// Settings for orthographic normalization applied before evaluation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OrthographicConfig {
    /// Apply Unicode NFC normalization.
    pub unicode_normalize: bool,
    /// Lowercase the text.
    pub case_insensitive: bool,
    /// Strip combining diacritical marks (via NFD decomposition).
    pub ignore_diacritics: bool,
    /// Explicit single-character substitutions, applied after the steps above.
    pub char_mappings: HashMap<char, char>,
}
impl Default for OrthographicConfig {
fn default() -> Self {
Self {
unicode_normalize: true,
case_insensitive: false,
ignore_diacritics: false,
char_mappings: HashMap::new(),
}
}
}
/// Aggregate results of a low-resource NER evaluation run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LowResourceResults {
    /// Standard span-level F1 from the base NER evaluator.
    pub token_f1: f64,
    /// Morpheme-level F1; present only for polysynthetic languages
    /// evaluated with a `MorphemeConfig`.
    pub morpheme_f1: Option<f64>,
    /// Character-overlap F1 (gives credit for partial span matches).
    pub char_f1: f64,
    /// Entity density relative to an assumed English baseline density.
    pub entity_density_ratio: f64,
    /// token_f1 divided by the configured English baseline F1, if set.
    pub transfer_efficiency: Option<f64>,
    /// Per-entity-type precision/recall/F1/support.
    pub per_type: HashMap<String, TypeMetrics>,
    /// Effect of orthographic normalization; present only when an
    /// `OrthographicConfig` was supplied.
    pub normalization_impact: Option<NormalizationImpact>,
    /// Language metadata echoed back from the caller.
    pub metadata: LowResourceMetadata,
}
/// Precision/recall/F1 and gold-support count for a single entity type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TypeMetrics {
    pub precision: f64,
    pub recall: f64,
    pub f1: f64,
    /// Number of gold (expected) entities of this type.
    pub support: usize,
}
/// Before/after comparison of F1 when orthographic normalization is applied.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NormalizationImpact {
    /// F1 on the raw, unnormalized test cases.
    pub raw_f1: f64,
    /// F1 after normalizing text and gold entity strings.
    pub normalized_f1: f64,
    /// normalized_f1 - raw_f1 (positive means normalization helped).
    pub improvement: f64,
    /// Count of inputs whose surface text changed under normalization.
    pub entities_affected: usize,
}
/// Descriptive metadata about the evaluated language.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LowResourceMetadata {
    /// Language code (ISO 639-style).
    pub language_code: String,
    /// Language family / genus label, when known.
    pub language_family: Option<String>,
    /// Whether the language is polysynthetic; enables morpheme-level F1
    /// in `LowResourceEvaluator::evaluate`.
    pub is_polysynthetic: bool,
    /// Whether a standardized orthography exists.
    pub has_standard_orthography: bool,
    /// Approximate speaker count, when known.
    pub speaker_population: Option<u64>,
    /// Endangerment classification, when known.
    pub endangerment_level: Option<EndangermentLevel>,
    /// Number of labeled training examples available, when known.
    pub training_examples: Option<usize>,
}
/// Language endangerment scale, ordered from safest to extinct.
/// NOTE(review): names mirror UNESCO-style endangerment categories — confirm
/// the intended source taxonomy before documenting externally.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum EndangermentLevel {
    Safe,
    Vulnerable,
    DefinitelyEndangered,
    SeverelyEndangered,
    CriticallyEndangered,
    Extinct,
}
/// Builder-configured evaluator that layers low-resource-specific metrics
/// (char/morpheme F1, entity-density ratio, transfer efficiency,
/// normalization impact) on top of the standard NER evaluation.
pub struct LowResourceEvaluator {
    // When set, enables morpheme-level F1 for polysynthetic languages.
    morpheme_config: Option<MorphemeConfig>,
    // When set, enables normalization-impact measurement.
    orthographic_config: Option<OrthographicConfig>,
    // English reference F1 used for the transfer-efficiency ratio.
    english_baseline_f1: Option<f64>,
}
impl LowResourceEvaluator {
pub fn new() -> Self {
Self {
morpheme_config: None,
orthographic_config: None,
english_baseline_f1: None,
}
}
pub fn with_morpheme_boundaries(mut self, config: MorphemeConfig) -> Self {
self.morpheme_config = Some(config);
self
}
pub fn with_orthographic_normalization(mut self, config: OrthographicConfig) -> Self {
self.orthographic_config = Some(config);
self
}
pub fn with_english_baseline(mut self, f1: f64) -> Self {
self.english_baseline_f1 = Some(f1);
self
}
/// Runs the full low-resource evaluation suite over `test_cases`.
///
/// Computes token-level F1 via the standard NER evaluator, character-level
/// F1, optional morpheme-level F1 (only for polysynthetic languages with a
/// `MorphemeConfig`), an entity-density ratio against an assumed English
/// baseline, optional transfer efficiency, and optional
/// orthographic-normalization impact.
///
/// # Errors
/// Returns `Error::InvalidInput` when `test_cases` is empty, and propagates
/// any failure from the model or the sub-evaluations.
pub fn evaluate(
    &self,
    model: &dyn Model,
    test_cases: &[(String, Vec<super::GoldEntity>)],
    metadata: LowResourceMetadata,
) -> Result<LowResourceResults> {
    if test_cases.is_empty() {
        return Err(Error::InvalidInput("Empty test cases".to_string()));
    }
    let standard_results = super::evaluate_ner_model(model, test_cases)?;
    let char_f1 = self.calculate_char_f1(model, test_cases)?;
    // Morpheme-level scoring needs both an explicit config and a
    // polysynthetic language.
    let morpheme_f1 = if self.morpheme_config.is_some() && metadata.is_polysynthetic {
        Some(self.calculate_morpheme_f1(model, test_cases)?)
    } else {
        None
    };
    // Density is entities per *character*; count chars (not bytes) so
    // multi-byte scripts are measured consistently with the char-level F1.
    let total_chars: usize = test_cases
        .iter()
        .map(|(text, _)| text.chars().count())
        .sum();
    let total_entities: usize = test_cases.iter().map(|(_, entities)| entities.len()).sum();
    let entity_density = if total_chars > 0 {
        total_entities as f64 / total_chars as f64
    } else {
        0.0
    };
    // Rough English NER corpus density used as the comparison point.
    const ENGLISH_BASELINE_DENSITY: f64 = 0.05;
    let entity_density_ratio = entity_density / ENGLISH_BASELINE_DENSITY;
    // Guard against a zero/negative baseline so we report None rather than
    // a meaningless ±inf or NaN ratio.
    let transfer_efficiency = self
        .english_baseline_f1
        .filter(|baseline| *baseline > 0.0)
        .map(|baseline| standard_results.f1 / baseline);
    let normalization_impact = if self.orthographic_config.is_some() {
        Some(self.calculate_normalization_impact(model, test_cases)?)
    } else {
        None
    };
    // Re-shape the base evaluator's per-type stats into this module's type.
    let per_type: HashMap<String, TypeMetrics> = standard_results
        .per_type
        .into_iter()
        .map(|(k, v)| {
            (
                k,
                TypeMetrics {
                    precision: v.precision,
                    recall: v.recall,
                    f1: v.f1,
                    support: v.expected,
                },
            )
        })
        .collect();
    Ok(LowResourceResults {
        token_f1: standard_results.f1,
        morpheme_f1,
        char_f1,
        entity_density_ratio,
        transfer_efficiency,
        per_type,
        normalization_impact,
        metadata,
    })
}
/// Character-level F1: every character covered by a gold or predicted span
/// contributes to recall/precision, so partial span overlap earns credit.
fn calculate_char_f1(
    &self,
    model: &dyn Model,
    test_cases: &[(String, Vec<super::GoldEntity>)],
) -> Result<f64> {
    // Flips mask[start..end) to true; out-of-range or inverted spans
    // simply mark nothing.
    fn mark(mask: &mut [bool], start: usize, end: usize) {
        for slot in mask.iter_mut().take(end).skip(start) {
            *slot = true;
        }
    }
    let (mut gold_total, mut pred_total, mut overlap) = (0usize, 0usize, 0usize);
    for (text, gold_entities) in test_cases {
        let predictions = model.extract_entities(text, None)?;
        let n_chars = text.chars().count();
        // NOTE(review): assumes entity start/end are character offsets —
        // confirm against the GoldEntity/Model contract.
        let mut gold_mask = vec![false; n_chars];
        for entity in gold_entities {
            mark(
                &mut gold_mask,
                entity.start.min(n_chars),
                entity.end.min(n_chars),
            );
        }
        let mut pred_mask = vec![false; n_chars];
        for entity in &predictions {
            mark(
                &mut pred_mask,
                entity.start().min(n_chars),
                entity.end().min(n_chars),
            );
        }
        for (&g, &p) in gold_mask.iter().zip(pred_mask.iter()) {
            gold_total += usize::from(g);
            pred_total += usize::from(p);
            overlap += usize::from(g && p);
        }
    }
    let precision = if pred_total > 0 {
        overlap as f64 / pred_total as f64
    } else {
        0.0
    };
    let recall = if gold_total > 0 {
        overlap as f64 / gold_total as f64
    } else {
        0.0
    };
    let f1 = if precision + recall > 0.0 {
        2.0 * precision * recall / (precision + recall)
    } else {
        0.0
    };
    Ok(f1)
}
/// Morpheme-level F1: entities are weighted by how many morphemes they
/// contain (split on the configured boundary character). A prediction's
/// morphemes count as correct only on an exact gold span match.
///
/// # Errors
/// Fails if no `MorphemeConfig` was supplied or the model fails.
fn calculate_morpheme_f1(
    &self,
    model: &dyn Model,
    test_cases: &[(String, Vec<super::GoldEntity>)],
) -> Result<f64> {
    let config = self.morpheme_config.as_ref().ok_or_else(|| {
        Error::evaluation(
            "morpheme-level evaluation requested without MorphemeConfig (call with_morpheme_boundaries(MorphemeConfig))",
        )
    })?;
    // Segments shorter than min_morpheme_len are ignored; every entity
    // still counts as at least one morpheme. Length is measured in
    // characters (not bytes) for consistency with the char-based offsets
    // used throughout this module.
    let count_morphemes = |s: &str| {
        s.split(config.boundary_char)
            .filter(|m| m.chars().count() >= config.min_morpheme_len)
            .count()
            .max(1)
    };
    let mut total_gold_morphemes = 0;
    let mut total_pred_morphemes = 0;
    let mut total_correct_morphemes = 0;
    for (text, gold_entities) in test_cases {
        let predictions = model.extract_entities(text, None)?;
        for entity in gold_entities {
            total_gold_morphemes += count_morphemes(&entity.text);
        }
        // Hoisted out of the prediction loop: invariant per test case.
        let char_count = text.chars().count();
        for entity in &predictions {
            let entity_text: String = text
                .chars()
                .skip(entity.start())
                .take(entity.end().min(char_count).saturating_sub(entity.start()))
                .collect();
            let morpheme_count = count_morphemes(&entity_text);
            total_pred_morphemes += morpheme_count;
            // Full credit only for exact span matches.
            if gold_entities
                .iter()
                .any(|gold| entity.start() == gold.start && entity.end() == gold.end)
            {
                total_correct_morphemes += morpheme_count;
            }
        }
    }
    let precision = if total_pred_morphemes > 0 {
        total_correct_morphemes as f64 / total_pred_morphemes as f64
    } else {
        0.0
    };
    let recall = if total_gold_morphemes > 0 {
        total_correct_morphemes as f64 / total_gold_morphemes as f64
    } else {
        0.0
    };
    let f1 = if precision + recall > 0.0 {
        2.0 * precision * recall / (precision + recall)
    } else {
        0.0
    };
    Ok(f1)
}
/// Measures how orthographic normalization changes F1 by evaluating the
/// model on both raw and normalized test cases.
///
/// # Errors
/// Fails if no `OrthographicConfig` was supplied or evaluation fails.
fn calculate_normalization_impact(
    &self,
    model: &dyn Model,
    test_cases: &[(String, Vec<super::GoldEntity>)],
) -> Result<NormalizationImpact> {
    let config = self.orthographic_config.as_ref().ok_or_else(|| {
        Error::evaluation(
            "normalization impact requested without OrthographicConfig (call with_orthographic_normalization(OrthographicConfig))",
        )
    })?;
    let raw_results = super::evaluate_ner_model(model, test_cases)?;
    // NOTE(review): gold spans keep their original start/end offsets even
    // though normalization (NFC, diacritic stripping) can change character
    // counts — confirm offsets stay valid for the orthographies evaluated.
    let normalized_cases: Vec<(String, Vec<super::GoldEntity>)> = test_cases
        .iter()
        .map(|(text, entities)| {
            let normalized_text = self.normalize_text(text, config);
            let normalized_entities: Vec<super::GoldEntity> = entities
                .iter()
                .map(|e| super::GoldEntity {
                    text: self.normalize_text(&e.text, config),
                    entity_type: e.entity_type.clone(),
                    original_label: e.original_label.clone(),
                    start: e.start,
                    end: e.end,
                })
                .collect();
            (normalized_text, normalized_entities)
        })
        .collect();
    let normalized_results = super::evaluate_ner_model(model, &normalized_cases)?;
    // Count gold *entities* whose surface form changes under normalization.
    // (The previous implementation counted affected test cases, which did
    // not match the field's name.)
    let entities_affected = test_cases
        .iter()
        .flat_map(|(_, entities)| entities.iter())
        .filter(|e| self.normalize_text(&e.text, config) != e.text)
        .count();
    Ok(NormalizationImpact {
        raw_f1: raw_results.f1,
        normalized_f1: normalized_results.f1,
        improvement: normalized_results.f1 - raw_results.f1,
        entities_affected,
    })
}
/// Applies the configured normalization pipeline: NFC, lowercasing,
/// diacritic stripping, then explicit character mappings, in that order.
fn normalize_text(&self, text: &str, config: &OrthographicConfig) -> String {
    let mut result = text.to_string();
    if config.unicode_normalize {
        use unicode_normalization::UnicodeNormalization;
        result = result.nfc().collect();
    }
    if config.case_insensitive {
        result = result.to_lowercase();
    }
    if config.ignore_diacritics {
        result = remove_diacritics(&result);
    }
    // Apply character mappings in a single pass so the outcome cannot
    // depend on HashMap iteration order: the previous sequential
    // `replace` calls could chain mappings (a→b then b→c)
    // nondeterministically.
    if !config.char_mappings.is_empty() {
        result = result
            .chars()
            .map(|c| config.char_mappings.get(&c).copied().unwrap_or(c))
            .collect();
    }
    result
}
}
impl Default for LowResourceEvaluator {
fn default() -> Self {
Self::new()
}
}
/// Strips combining marks by decomposing to NFD and dropping every
/// combining code point (e.g. "é" becomes "e").
fn remove_diacritics(text: &str) -> String {
    use unicode_normalization::UnicodeNormalization;
    let mut stripped = String::with_capacity(text.len());
    for ch in text.nfd() {
        if !unicode_normalization::char::is_combining_mark(ch) {
            stripped.push(ch);
        }
    }
    stripped
}
pub fn language_metadata(language_code: &str) -> Option<LowResourceMetadata> {
match language_code {
"qxo" => Some(LowResourceMetadata {
language_code: "qxo".to_string(),
language_family: Some("Quechuan".to_string()),
is_polysynthetic: false, has_standard_orthography: false,
speaker_population: Some(200_000),
endangerment_level: Some(EndangermentLevel::Vulnerable),
training_examples: Some(12), }),
"chr" => Some(LowResourceMetadata {
language_code: "chr".to_string(),
language_family: Some("Iroquoian".to_string()),
is_polysynthetic: true,
has_standard_orthography: true, speaker_population: Some(2_000),
endangerment_level: Some(EndangermentLevel::SeverelyEndangered),
training_examples: None,
}),
"nav" => Some(LowResourceMetadata {
language_code: "nav".to_string(),
language_family: Some("Na-Dené".to_string()),
is_polysynthetic: true,
has_standard_orthography: true,
speaker_population: Some(170_000),
endangerment_level: Some(EndangermentLevel::Vulnerable),
training_examples: None,
}),
"gn" | "grn" => Some(LowResourceMetadata {
language_code: "grn".to_string(),
language_family: Some("Tupian".to_string()),
is_polysynthetic: false,
has_standard_orthography: true,
speaker_population: Some(6_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: None,
}),
"nah" => Some(LowResourceMetadata {
language_code: "nah".to_string(),
language_family: Some("Uto-Aztecan".to_string()),
is_polysynthetic: true,
has_standard_orthography: false, speaker_population: Some(1_700_000),
endangerment_level: Some(EndangermentLevel::Vulnerable),
training_examples: None,
}),
"shp" => Some(LowResourceMetadata {
language_code: "shp".to_string(),
language_family: Some("Panoan".to_string()),
is_polysynthetic: false,
has_standard_orthography: true,
speaker_population: Some(35_000),
endangerment_level: Some(EndangermentLevel::Vulnerable),
training_examples: None,
}),
"sw" | "swa" => Some(LowResourceMetadata {
language_code: "swa".to_string(),
language_family: Some("Atlantic-Congo (Bantu)".to_string()),
is_polysynthetic: false, has_standard_orthography: true,
speaker_population: Some(100_000_000), endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(9_418), }),
"yo" | "yor" => Some(LowResourceMetadata {
language_code: "yor".to_string(),
language_family: Some("Atlantic-Congo (Volta-Niger)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(45_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(9_824), }),
"ha" | "hau" => Some(LowResourceMetadata {
language_code: "hau".to_string(),
language_family: Some("Afro-Asiatic (Chadic)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(80_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(8_165), }),
"am" | "amh" => Some(LowResourceMetadata {
language_code: "amh".to_string(),
language_family: Some("Afro-Asiatic (Semitic)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(57_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(1_750), }),
"ig" | "ibo" => Some(LowResourceMetadata {
language_code: "ibo".to_string(),
language_family: Some("Atlantic-Congo (Volta-Niger)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(45_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(10_905), }),
"rw" | "kin" => Some(LowResourceMetadata {
language_code: "kin".to_string(),
language_family: Some("Atlantic-Congo (Bantu)".to_string()),
is_polysynthetic: false, has_standard_orthography: true,
speaker_population: Some(12_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(11_178), }),
"pcm" => Some(LowResourceMetadata {
language_code: "pcm".to_string(),
language_family: Some("English Creole".to_string()),
is_polysynthetic: false,
has_standard_orthography: false, speaker_population: Some(100_000_000), endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(7_746), }),
"wo" | "wol" => Some(LowResourceMetadata {
language_code: "wol".to_string(),
language_family: Some("Atlantic-Congo (Atlantic)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true,
speaker_population: Some(12_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(6_561), }),
"zu" | "zul" => Some(LowResourceMetadata {
language_code: "zul".to_string(),
language_family: Some("Atlantic-Congo (Bantu/Nguni)".to_string()),
is_polysynthetic: false, has_standard_orthography: true,
speaker_population: Some(27_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(8_354), }),
"xh" | "xho" => Some(LowResourceMetadata {
language_code: "xho".to_string(),
language_family: Some("Atlantic-Congo (Bantu/Nguni)".to_string()),
is_polysynthetic: false, has_standard_orthography: true,
speaker_population: Some(19_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(8_168), }),
"lg" | "lug" => Some(LowResourceMetadata {
language_code: "lug".to_string(),
language_family: Some("Atlantic-Congo (Bantu)".to_string()),
is_polysynthetic: false, has_standard_orthography: true,
speaker_population: Some(10_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(7_060), }),
"luo" => Some(LowResourceMetadata {
language_code: "luo".to_string(),
language_family: Some("Nilo-Saharan (Nilotic)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true,
speaker_population: Some(6_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(7_372), }),
"tw" | "twi" | "aka" => Some(LowResourceMetadata {
language_code: "twi".to_string(),
language_family: Some("Atlantic-Congo (Kwa)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(11_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(6_056), }),
"sn" | "sna" => Some(LowResourceMetadata {
language_code: "sna".to_string(),
language_family: Some("Atlantic-Congo (Bantu)".to_string()),
is_polysynthetic: false, has_standard_orthography: true,
speaker_population: Some(15_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(8_867), }),
"ti" | "tir" => Some(LowResourceMetadata {
language_code: "tir".to_string(),
language_family: Some("Afro-Asiatic (Semitic)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(9_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: None, }),
"bm" | "bam" => Some(LowResourceMetadata {
language_code: "bam".to_string(),
language_family: Some("Atlantic-Congo (Mande)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(14_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(6_375), }),
"ee" | "ewe" => Some(LowResourceMetadata {
language_code: "ewe".to_string(),
language_family: Some("Atlantic-Congo (Kwa)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(7_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(5_007), }),
"fon" => Some(LowResourceMetadata {
language_code: "fon".to_string(),
language_family: Some("Atlantic-Congo (Kwa)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true, speaker_population: Some(2_200_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(6_204), }),
"mos" => Some(LowResourceMetadata {
language_code: "mos".to_string(),
language_family: Some("Atlantic-Congo (Gur)".to_string()),
is_polysynthetic: false,
has_standard_orthography: true,
speaker_population: Some(8_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(6_793), }),
"tn" | "tsn" => Some(LowResourceMetadata {
language_code: "tsn".to_string(),
language_family: Some("Atlantic-Congo (Bantu)".to_string()),
is_polysynthetic: false, has_standard_orthography: true,
speaker_population: Some(8_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(4_784), }),
"ny" | "nya" => Some(LowResourceMetadata {
language_code: "nya".to_string(),
language_family: Some("Atlantic-Congo (Bantu)".to_string()),
is_polysynthetic: false, has_standard_orthography: true,
speaker_population: Some(15_000_000),
endangerment_level: Some(EndangermentLevel::Safe),
training_examples: Some(8_928), }),
"bbj" => Some(LowResourceMetadata {
language_code: "bbj".to_string(),
language_family: Some("Atlantic-Congo (Grassfields Bantu)".to_string()),
is_polysynthetic: false,
has_standard_orthography: false, speaker_population: Some(1_000_000),
endangerment_level: Some(EndangermentLevel::Vulnerable),
training_examples: Some(4_833), }),
_ => None,
}
}
/// The 20 MasakhaNER 2.0 language codes paired with their display names.
pub const MASAKHANER2_LANGUAGES: &[(&str, &str)] = &[
    ("bam", "Bambara"),
    ("bbj", "Ghomala"),
    ("ewe", "Ewe"),
    ("fon", "Fon"),
    ("hau", "Hausa"),
    ("ibo", "Igbo"),
    ("kin", "Kinyarwanda"),
    ("lug", "Luganda"),
    ("luo", "Dholuo"),
    ("mos", "Mossi"),
    ("nya", "Chichewa"),
    ("pcm", "Nigerian Pidgin"),
    ("sna", "Shona"),
    ("swa", "Swahili"),
    ("tsn", "Setswana"),
    ("twi", "Twi"),
    ("wol", "Wolof"),
    ("xho", "Xhosa"),
    ("yor", "Yoruba"),
    ("zul", "Zulu"),
];
/// Looks up the display name for a MasakhaNER 2.0 language code;
/// returns `None` for codes outside the benchmark.
pub fn masakhaner2_language_name(code: &str) -> Option<&'static str> {
    MASAKHANER2_LANGUAGES
        .iter()
        .find_map(|&(c, name)| (c == code).then_some(name))
}
#[cfg(test)]
mod tests {
    use super::*;

    // Spot-checks the curated metadata table.
    #[test]
    fn test_language_metadata() {
        let quechua = language_metadata("qxo").unwrap();
        assert_eq!(quechua.language_family.as_deref(), Some("Quechuan"));
        assert!(!quechua.is_polysynthetic);

        let cherokee = language_metadata("chr").unwrap();
        assert!(cherokee.is_polysynthetic);
        assert_eq!(
            cherokee.endangerment_level,
            Some(EndangermentLevel::SeverelyEndangered)
        );
    }

    // Lowercasing plus diacritic stripping: "Café" -> "cafe".
    #[test]
    fn test_orthographic_normalization() {
        let config = OrthographicConfig {
            unicode_normalize: true,
            case_insensitive: true,
            ignore_diacritics: true,
            char_mappings: HashMap::new(),
        };
        let evaluator = LowResourceEvaluator::new();
        assert_eq!(evaluator.normalize_text("Café", &config), "cafe");
    }

    // Each builder method should record its setting.
    #[test]
    fn test_evaluator_creation() {
        let evaluator = LowResourceEvaluator::new()
            .with_morpheme_boundaries(MorphemeConfig::default())
            .with_orthographic_normalization(OrthographicConfig::default())
            .with_english_baseline(0.92);
        assert!(evaluator.morpheme_config.is_some());
        assert!(evaluator.orthographic_config.is_some());
        assert_eq!(evaluator.english_baseline_f1, Some(0.92));
    }
}