anno-eval 0.10.0

Evaluation harnesses, datasets, and muxer-backed sampling for anno
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
//! Low-resource and morphologically complex language evaluation metrics.
//!
//! This module provides specialized evaluation tools for:
//! - Indigenous/Native American languages (Quechua, Cherokee, Navajo, etc.)
//! - Polysynthetic languages with complex morphology
//! - Languages with orthographic variation
//! - Low-resource scenarios with limited training data
//!
//! # Key Metrics
//!
//! - **Morpheme-level F1**: Evaluation at morpheme boundaries (important for polysynthetic languages)
//! - **Character-level F1**: Robust to tokenization differences
//! - **Normalized Entity Ratio**: Compares entity density across languages
//! - **Transfer Efficiency**: Measures how well high-resource models transfer
//! - **Orthographic Robustness**: Handles spelling variations
//!
//! # Example
//!
//! ```rust,ignore
//! use anno_eval::eval::low_resource::{LowResourceEvaluator, MorphemeConfig};
//!
//! let evaluator = LowResourceEvaluator::new()
//!     .with_morpheme_boundaries(MorphemeConfig::default())
//!     .with_orthographic_normalization(OrthographicConfig::default());
//!
//! let results = evaluator.evaluate(&model, &quechua_dataset, metadata)?;
//! println!("Morpheme F1: {:.3}", results.morpheme_f1);
//! println!("Transfer efficiency: {:.3}", results.transfer_efficiency);
//! ```
//!
//! # References
//!
//! - qxoRef: Galarreta et al., AmericasNLP 2021 (Quechua coreference)
//! - AmericasNLI: Ebrahimi et al., EMNLP 2022 (Indigenous NLI)
//! - CorefUD 1.3: Nedoluzhko et al., 2022 (Multilingual coreference)

use anno::{Error, Model, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Configuration for morpheme-level evaluation.
///
/// Morpheme boundaries are assumed to be pre-marked in the text with a
/// boundary character (default `'-'`, as in interlinear glosses); this
/// config controls how entity text is split into morphemes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MorphemeConfig {
    /// Character used as morpheme boundary marker
    pub boundary_char: char,
    /// Whether to use character-level fallback when morpheme boundaries unavailable
    pub char_level_fallback: bool,
    /// Minimum morpheme length to count
    pub min_morpheme_len: usize,
}

impl Default for MorphemeConfig {
    fn default() -> Self {
        Self {
            boundary_char: '-',
            char_level_fallback: true,
            min_morpheme_len: 1,
        }
    }
}

/// Configuration for orthographic normalization.
///
/// Applied by `LowResourceEvaluator::normalize_text` to texts and gold
/// entity surface forms before comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OrthographicConfig {
    /// Enable Unicode normalization (NFC)
    pub unicode_normalize: bool,
    /// Case-insensitive matching
    pub case_insensitive: bool,
    /// Diacritic-insensitive matching
    pub ignore_diacritics: bool,
    /// Custom character mappings (e.g., for non-standard orthographies)
    pub char_mappings: HashMap<char, char>,
}

impl Default for OrthographicConfig {
    fn default() -> Self {
        Self {
            unicode_normalize: true,
            case_insensitive: false,
            ignore_diacritics: false,
            char_mappings: HashMap::new(),
        }
    }
}

/// Results from low-resource language evaluation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LowResourceResults {
    /// Standard token-level F1
    pub token_f1: f64,
    /// Morpheme-level F1 (for polysynthetic languages).
    /// `None` unless a `MorphemeConfig` was supplied AND the metadata
    /// marks the language as polysynthetic.
    pub morpheme_f1: Option<f64>,
    /// Character-level F1 (robust to tokenization)
    pub char_f1: f64,
    /// Entity density ratio compared to English baseline
    pub entity_density_ratio: f64,
    /// Transfer efficiency (F1 / English F1 baseline).
    /// `None` when no English baseline was configured.
    pub transfer_efficiency: Option<f64>,
    /// Per-entity-type breakdown
    pub per_type: HashMap<String, TypeMetrics>,
    /// Orthographic normalization impact.
    /// `None` unless an `OrthographicConfig` was supplied.
    pub normalization_impact: Option<NormalizationImpact>,
    /// Language-specific metadata
    pub metadata: LowResourceMetadata,
}

/// Per-type metrics for low-resource evaluation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TypeMetrics {
    /// Precision score (0-1)
    pub precision: f64,
    /// Recall score (0-1)
    pub recall: f64,
    /// F1 score (0-1)
    pub f1: f64,
    /// Number of examples for this type
    /// (copied from the standard evaluation's `expected` gold count).
    pub support: usize,
}

/// Impact of orthographic normalization on results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NormalizationImpact {
    /// F1 without normalization
    pub raw_f1: f64,
    /// F1 with normalization
    pub normalized_f1: f64,
    /// Improvement from normalization (`normalized_f1 - raw_f1`; may be negative)
    pub improvement: f64,
    /// Number of entities affected
    pub entities_affected: usize,
}

/// Metadata about the low-resource evaluation.
///
/// Use [`language_metadata`] for pre-filled entries covering supported
/// language codes, or construct manually for other languages.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LowResourceMetadata {
    /// ISO 639-3 language code
    pub language_code: String,
    /// Language family
    pub language_family: Option<String>,
    /// Whether language is polysynthetic (gates morpheme-level F1)
    pub is_polysynthetic: bool,
    /// Whether language has standardized orthography
    pub has_standard_orthography: bool,
    /// Estimated speaker population
    pub speaker_population: Option<u64>,
    /// UNESCO endangerment level
    pub endangerment_level: Option<EndangermentLevel>,
    /// Number of training examples available
    pub training_examples: Option<usize>,
}

/// UNESCO language endangerment levels.
///
/// Variants are listed from least to most endangered.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum EndangermentLevel {
    /// Language is used by all ages
    Safe,
    /// Most children speak the language
    Vulnerable,
    /// Children speak at home but not in school
    DefinitelyEndangered,
    /// Only spoken by grandparents
    SeverelyEndangered,
    /// Only spoken by a few elderly
    CriticallyEndangered,
    /// No living speakers
    Extinct,
}

/// Evaluator for low-resource and morphologically complex languages.
///
/// Build with [`LowResourceEvaluator::new`] and the `with_*` builder
/// methods; every analysis is opt-in (all fields default to `None`).
pub struct LowResourceEvaluator {
    /// Enables morpheme-level F1 when set (for polysynthetic languages).
    morpheme_config: Option<MorphemeConfig>,
    /// Enables orthographic-normalization impact analysis when set.
    orthographic_config: Option<OrthographicConfig>,
    /// English baseline F1 used for transfer-efficiency computation.
    english_baseline_f1: Option<f64>,
}

impl LowResourceEvaluator {
    /// Create a new low-resource evaluator with default settings.
    pub fn new() -> Self {
        Self {
            morpheme_config: None,
            orthographic_config: None,
            english_baseline_f1: None,
        }
    }

    /// Enable morpheme-level evaluation.
    pub fn with_morpheme_boundaries(mut self, config: MorphemeConfig) -> Self {
        self.morpheme_config = Some(config);
        self
    }

    /// Enable orthographic normalization.
    pub fn with_orthographic_normalization(mut self, config: OrthographicConfig) -> Self {
        self.orthographic_config = Some(config);
        self
    }

    /// Set English baseline F1 for transfer efficiency calculation.
    pub fn with_english_baseline(mut self, f1: f64) -> Self {
        self.english_baseline_f1 = Some(f1);
        self
    }

    /// Evaluate model on low-resource dataset.
    ///
    /// Produces token-, character- and (optionally) morpheme-level scores,
    /// plus entity-density and transfer-efficiency diagnostics.
    ///
    /// # Errors
    ///
    /// Returns an error when `test_cases` is empty, or when the underlying
    /// model or evaluation calls fail.
    pub fn evaluate(
        &self,
        model: &dyn Model,
        test_cases: &[(String, Vec<super::GoldEntity>)],
        metadata: LowResourceMetadata,
    ) -> Result<LowResourceResults> {
        if test_cases.is_empty() {
            return Err(Error::InvalidInput("Empty test cases".to_string()));
        }

        // Standard token-level metrics from the shared NER harness.
        let standard_results = super::evaluate_ner_model(model, test_cases)?;

        // Character-level F1 is robust to tokenization differences.
        let char_f1 = self.calculate_char_f1(model, test_cases)?;

        // Morpheme-level F1 only applies to polysynthetic languages and
        // requires an explicit MorphemeConfig.
        let morpheme_f1 = if self.morpheme_config.is_some() && metadata.is_polysynthetic {
            Some(self.calculate_morpheme_f1(model, test_cases)?)
        } else {
            None
        };

        // Entity density = entities per character. Count CHARACTERS, not
        // bytes: `text.len()` is a byte length and over-counts for
        // non-ASCII scripts (Ge'ez, tonal diacritics, syllabaries), which
        // would deflate the density for exactly the languages this module
        // targets.
        let total_chars: usize = test_cases
            .iter()
            .map(|(text, _)| text.chars().count())
            .sum();
        let total_entities: usize = test_cases.iter().map(|(_, entities)| entities.len()).sum();
        let entity_density = if total_chars > 0 {
            total_entities as f64 / total_chars as f64
        } else {
            0.0
        };
        // English baseline entity density (approximate from CoNLL-2003)
        let english_baseline_density = 0.05;
        let entity_density_ratio = entity_density / english_baseline_density;

        // Transfer efficiency = F1 / English-baseline F1. Guard against a
        // zero (or negative) baseline so we report None instead of
        // +inf/NaN.
        let transfer_efficiency = self
            .english_baseline_f1
            .and_then(|baseline| (baseline > 0.0).then(|| standard_results.f1 / baseline));

        // Normalization impact only when orthographic handling is configured.
        let normalization_impact = if self.orthographic_config.is_some() {
            Some(self.calculate_normalization_impact(model, test_cases)?)
        } else {
            None
        };

        // Re-shape the shared harness's per-type metrics into our own type.
        let per_type: HashMap<String, TypeMetrics> = standard_results
            .per_type
            .into_iter()
            .map(|(k, v)| {
                (
                    k,
                    TypeMetrics {
                        precision: v.precision,
                        recall: v.recall,
                        f1: v.f1,
                        support: v.expected,
                    },
                )
            })
            .collect();

        Ok(LowResourceResults {
            token_f1: standard_results.f1,
            morpheme_f1,
            char_f1,
            entity_density_ratio,
            transfer_efficiency,
            per_type,
            normalization_impact,
            metadata,
        })
    }

    /// Calculate character-level F1.
    ///
    /// This is more robust to tokenization differences across languages:
    /// each character position is scored as covered/uncovered by gold and
    /// predicted spans, and micro-averaged F1 is computed over positions.
    fn calculate_char_f1(
        &self,
        model: &dyn Model,
        test_cases: &[(String, Vec<super::GoldEntity>)],
    ) -> Result<f64> {
        // Accumulators across all documents: gold-covered chars,
        // prediction-covered chars, and chars covered by both.
        let mut gold_total = 0usize;
        let mut pred_total = 0usize;
        let mut overlap_total = 0usize;

        for (text, gold_entities) in test_cases {
            let predictions = model.extract_entities(text, None)?;
            let n_chars = text.chars().count();

            // Mark every character position covered by a gold entity
            // (spans are clamped to the text; inverted spans are empty).
            let mut gold_cov = vec![false; n_chars];
            for gold in gold_entities {
                let s = gold.start.min(n_chars);
                let e = gold.end.min(n_chars);
                for flag in &mut gold_cov[s..e.max(s)] {
                    *flag = true;
                }
            }

            // Same for model predictions.
            let mut pred_cov = vec![false; n_chars];
            for pred in &predictions {
                let s = pred.start().min(n_chars);
                let e = pred.end().min(n_chars);
                for flag in &mut pred_cov[s..e.max(s)] {
                    *flag = true;
                }
            }

            // Tally agreement position by position.
            for (&g, &p) in gold_cov.iter().zip(pred_cov.iter()) {
                gold_total += g as usize;
                pred_total += p as usize;
                overlap_total += (g && p) as usize;
            }
        }

        // Micro-averaged precision/recall with zero-safe division.
        let safe_div = |num: usize, den: usize| {
            if den > 0 {
                num as f64 / den as f64
            } else {
                0.0
            }
        };
        let precision = safe_div(overlap_total, pred_total);
        let recall = safe_div(overlap_total, gold_total);

        Ok(if precision + recall > 0.0 {
            2.0 * precision * recall / (precision + recall)
        } else {
            0.0
        })
    }

    /// Calculate morpheme-level F1 for polysynthetic languages.
    ///
    /// Morphemes are obtained by splitting entity text on the configured
    /// boundary character; a predicted entity's morphemes count as correct
    /// only when the prediction matches a gold entity span exactly.
    ///
    /// # Errors
    ///
    /// Returns an error if no [`MorphemeConfig`] was provided, or if the
    /// model fails to extract entities.
    fn calculate_morpheme_f1(
        &self,
        model: &dyn Model,
        test_cases: &[(String, Vec<super::GoldEntity>)],
    ) -> Result<f64> {
        let config = self.morpheme_config.as_ref().ok_or_else(|| {
            Error::evaluation(
                "morpheme-level evaluation requested without MorphemeConfig (call with_morpheme_boundaries with a MorphemeConfig)",
            )
        })?;

        // Split on the boundary char and keep morphemes meeting the minimum
        // length, measured in CHARACTERS (not bytes) so multi-byte,
        // diacritic-heavy orthographies are handled consistently with the
        // rest of this module. Every entity contributes at least one
        // morpheme.
        let count_morphemes = |s: &str| -> usize {
            s.split(config.boundary_char)
                .filter(|m| m.chars().count() >= config.min_morpheme_len)
                .count()
                .max(1)
        };

        let mut total_gold_morphemes = 0;
        let mut total_pred_morphemes = 0;
        let mut total_correct_morphemes = 0;

        for (text, gold_entities) in test_cases {
            let predictions = model.extract_entities(text, None)?;

            // Gold-side morpheme total.
            for entity in gold_entities {
                total_gold_morphemes += count_morphemes(&entity.text);
            }

            // Hoisted out of the prediction loop: the char count of `text`
            // is invariant per document (was recomputed per prediction).
            let char_count = text.chars().count();

            for entity in &predictions {
                // Note: entity.start/end are CHARACTER offsets, not byte offsets
                let entity_text: String = text
                    .chars()
                    .skip(entity.start())
                    .take(entity.end().min(char_count).saturating_sub(entity.start()))
                    .collect();
                let morpheme_count = count_morphemes(&entity_text);
                total_pred_morphemes += morpheme_count;

                // A prediction is correct only on an exact span match.
                if gold_entities
                    .iter()
                    .any(|gold| entity.start() == gold.start && entity.end() == gold.end)
                {
                    total_correct_morphemes += morpheme_count;
                }
            }
        }

        let precision = if total_pred_morphemes > 0 {
            total_correct_morphemes as f64 / total_pred_morphemes as f64
        } else {
            0.0
        };
        let recall = if total_gold_morphemes > 0 {
            total_correct_morphemes as f64 / total_gold_morphemes as f64
        } else {
            0.0
        };
        let f1 = if precision + recall > 0.0 {
            2.0 * precision * recall / (precision + recall)
        } else {
            0.0
        };

        Ok(f1)
    }

    /// Calculate impact of orthographic normalization.
    ///
    /// Runs the standard evaluation twice — once on the raw test cases and
    /// once with texts and gold surface forms normalized — and reports the
    /// F1 delta plus how many gold entities were touched by normalization.
    ///
    /// NOTE(review): gold start/end offsets are carried over unchanged into
    /// the normalized cases; if normalization changes character counts
    /// (NFC composition, diacritic removal) those offsets may drift —
    /// confirm against how `evaluate_ner_model` matches spans.
    ///
    /// # Errors
    ///
    /// Returns an error if no [`OrthographicConfig`] was provided, or if a
    /// model/evaluation call fails.
    fn calculate_normalization_impact(
        &self,
        model: &dyn Model,
        test_cases: &[(String, Vec<super::GoldEntity>)],
    ) -> Result<NormalizationImpact> {
        let config = self.orthographic_config.as_ref().ok_or_else(|| {
            Error::evaluation(
                "normalization impact requested without OrthographicConfig (call with_orthographic_normalization with an OrthographicConfig)",
            )
        })?;

        // Evaluate without normalization
        let raw_results = super::evaluate_ner_model(model, test_cases)?;

        // Apply normalization to texts and gold entity surface forms.
        let normalized_cases: Vec<(String, Vec<super::GoldEntity>)> = test_cases
            .iter()
            .map(|(text, entities)| {
                let normalized_text = self.normalize_text(text, config);
                let normalized_entities: Vec<super::GoldEntity> = entities
                    .iter()
                    .map(|e| super::GoldEntity {
                        text: self.normalize_text(&e.text, config),
                        entity_type: e.entity_type.clone(),
                        original_label: e.original_label.clone(),
                        start: e.start,
                        end: e.end,
                    })
                    .collect();
                (normalized_text, normalized_entities)
            })
            .collect();

        // Evaluate with normalization
        let normalized_results = super::evaluate_ner_model(model, &normalized_cases)?;

        // Count affected ENTITIES — i.e. gold entities whose surface form
        // changes under normalization. (The previous implementation counted
        // affected *texts*, which disagreed with the documented meaning of
        // `NormalizationImpact::entities_affected`.)
        let entities_affected = test_cases
            .iter()
            .flat_map(|(_, entities)| entities.iter())
            .filter(|e| self.normalize_text(&e.text, config) != e.text)
            .count();

        Ok(NormalizationImpact {
            raw_f1: raw_results.f1,
            normalized_f1: normalized_results.f1,
            improvement: normalized_results.f1 - raw_results.f1,
            entities_affected,
        })
    }

    /// Apply orthographic normalization to text.
    ///
    /// Steps, in order: Unicode NFC normalization, lowercasing, diacritic
    /// removal, then custom character mappings. Each custom mapping is
    /// applied exactly once per character in a single pass — mappings do
    /// NOT chain. (The previous per-mapping `replace` loop could chain
    /// `a -> b` then `b -> c` depending on HashMap iteration order, which
    /// is nondeterministic, and allocated a fresh String per mapping.)
    fn normalize_text(&self, text: &str, config: &OrthographicConfig) -> String {
        let mut result = text.to_string();

        // Unicode normalization (NFC)
        if config.unicode_normalize {
            use unicode_normalization::UnicodeNormalization;
            result = result.nfc().collect();
        }

        // Case normalization
        if config.case_insensitive {
            result = result.to_lowercase();
        }

        // Diacritic removal
        if config.ignore_diacritics {
            result = remove_diacritics(&result);
        }

        // Custom character mappings: one deterministic pass, no per-mapping
        // string allocations.
        if !config.char_mappings.is_empty() {
            result = result
                .chars()
                .map(|c| config.char_mappings.get(&c).copied().unwrap_or(c))
                .collect();
        }

        result
    }
}

impl Default for LowResourceEvaluator {
    fn default() -> Self {
        Self::new()
    }
}

/// Remove diacritics from text.
///
/// Decomposes to NFD and drops Unicode combining marks, keeping only the
/// base characters.
fn remove_diacritics(text: &str) -> String {
    use unicode_normalization::UnicodeNormalization;
    let mut out = String::with_capacity(text.len());
    for ch in text.nfd() {
        if !unicode_normalization::char::is_combining_mark(ch) {
            out.push(ch);
        }
    }
    out
}

/// Create metadata for common low-resource languages.
///
/// Covers Indigenous American languages (qxoRef / AmericasNLP) and African
/// languages (MasakhaNER 2.0). Accepts both ISO 639-1 and 639-3 codes
/// where both exist; returns `None` for unknown codes.
pub fn language_metadata(language_code: &str) -> Option<LowResourceMetadata> {
    match language_code {
        // Quechua (Conchucos dialect - qxoRef)
        "qxo" => Some(LowResourceMetadata {
            language_code: "qxo".to_string(),
            language_family: Some("Quechuan".to_string()),
            is_polysynthetic: false, // Quechua is agglutinative, not polysynthetic
            has_standard_orthography: false,
            speaker_population: Some(200_000),
            endangerment_level: Some(EndangermentLevel::Vulnerable),
            training_examples: Some(12), // qxoRef has 12 documents
        }),
        // Cherokee
        "chr" => Some(LowResourceMetadata {
            language_code: "chr".to_string(),
            language_family: Some("Iroquoian".to_string()),
            is_polysynthetic: true,
            has_standard_orthography: true, // Cherokee syllabary is standardized
            speaker_population: Some(2_000),
            endangerment_level: Some(EndangermentLevel::SeverelyEndangered),
            training_examples: None,
        }),
        // Navajo
        "nav" => Some(LowResourceMetadata {
            language_code: "nav".to_string(),
            language_family: Some("Na-Dené".to_string()),
            is_polysynthetic: true,
            has_standard_orthography: true,
            speaker_population: Some(170_000),
            endangerment_level: Some(EndangermentLevel::Vulnerable),
            training_examples: None,
        }),
        // Guarani
        "gn" | "grn" => Some(LowResourceMetadata {
            language_code: "grn".to_string(),
            language_family: Some("Tupian".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true,
            speaker_population: Some(6_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: None,
        }),
        // Nahuatl
        "nah" => Some(LowResourceMetadata {
            language_code: "nah".to_string(),
            language_family: Some("Uto-Aztecan".to_string()),
            is_polysynthetic: true,
            has_standard_orthography: false, // Multiple competing orthographies
            speaker_population: Some(1_700_000),
            endangerment_level: Some(EndangermentLevel::Vulnerable),
            training_examples: None,
        }),
        // Shipibo-Konibo
        "shp" => Some(LowResourceMetadata {
            language_code: "shp".to_string(),
            language_family: Some("Panoan".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true,
            speaker_population: Some(35_000),
            endangerment_level: Some(EndangermentLevel::Vulnerable),
            training_examples: None,
        }),

        // =========================================================================
        // African Languages (MasakhaNER 2.0 languages)
        // =========================================================================

        // Swahili (Kiswahili) - East Africa's lingua franca
        "sw" | "swa" => Some(LowResourceMetadata {
            language_code: "swa".to_string(),
            language_family: Some("Atlantic-Congo (Bantu)".to_string()),
            is_polysynthetic: false, // Agglutinative, not polysynthetic
            has_standard_orthography: true,
            speaker_population: Some(100_000_000), // Including L2 speakers
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(9_418), // MasakhaNER 2.0 train+dev+test
        }),

        // Yoruba - Tonal language with diacritics
        "yo" | "yor" => Some(LowResourceMetadata {
            language_code: "yor".to_string(),
            language_family: Some("Atlantic-Congo (Volta-Niger)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // Standard with tone marks
            speaker_population: Some(45_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(9_824), // MasakhaNER 2.0
        }),

        // Hausa - Major West African trade language
        "ha" | "hau" => Some(LowResourceMetadata {
            language_code: "hau".to_string(),
            language_family: Some("Afro-Asiatic (Chadic)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // Latin (Boko) standard
            speaker_population: Some(80_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(8_165), // MasakhaNER 2.0
        }),

        // Amharic - Ethiopian Semitic with Ge'ez script
        "am" | "amh" => Some(LowResourceMetadata {
            language_code: "amh".to_string(),
            language_family: Some("Afro-Asiatic (Semitic)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // Ge'ez (Ethiopic) script
            speaker_population: Some(57_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(1_750), // MasakhaNER 1.0
        }),

        // Igbo - Tonal language of Nigeria
        "ig" | "ibo" => Some(LowResourceMetadata {
            language_code: "ibo".to_string(),
            language_family: Some("Atlantic-Congo (Volta-Niger)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // Ọnwụ alphabet
            speaker_population: Some(45_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(10_905), // MasakhaNER 2.0
        }),

        // Kinyarwanda - Rwanda/Burundi
        "rw" | "kin" => Some(LowResourceMetadata {
            language_code: "kin".to_string(),
            language_family: Some("Atlantic-Congo (Bantu)".to_string()),
            is_polysynthetic: false, // Agglutinative
            has_standard_orthography: true,
            speaker_population: Some(12_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(11_178), // MasakhaNER 2.0
        }),

        // Nigerian Pidgin - English-based creole
        "pcm" => Some(LowResourceMetadata {
            language_code: "pcm".to_string(),
            language_family: Some("English Creole".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: false, // No standardized spelling
            speaker_population: Some(100_000_000), // L2 speakers
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(7_746), // MasakhaNER 2.0
        }),

        // Wolof - Senegal/Gambia
        "wo" | "wol" => Some(LowResourceMetadata {
            language_code: "wol".to_string(),
            language_family: Some("Atlantic-Congo (Atlantic)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true,
            speaker_population: Some(12_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(6_561), // MasakhaNER 2.0
        }),

        // Zulu (isiZulu) - South Africa
        "zu" | "zul" => Some(LowResourceMetadata {
            language_code: "zul".to_string(),
            language_family: Some("Atlantic-Congo (Bantu/Nguni)".to_string()),
            is_polysynthetic: false, // Agglutinative
            has_standard_orthography: true,
            speaker_population: Some(27_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(8_354), // MasakhaNER 2.0
        }),

        // Xhosa (isiXhosa) - South Africa, with clicks
        "xh" | "xho" => Some(LowResourceMetadata {
            language_code: "xho".to_string(),
            language_family: Some("Atlantic-Congo (Bantu/Nguni)".to_string()),
            is_polysynthetic: false, // Agglutinative
            has_standard_orthography: true,
            speaker_population: Some(19_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(8_168), // MasakhaNER 2.0
        }),

        // Luganda - Uganda
        "lg" | "lug" => Some(LowResourceMetadata {
            language_code: "lug".to_string(),
            language_family: Some("Atlantic-Congo (Bantu)".to_string()),
            is_polysynthetic: false, // Agglutinative
            has_standard_orthography: true,
            speaker_population: Some(10_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(7_060), // MasakhaNER 2.0
        }),

        // Luo (Dholuo) - Kenya/Tanzania
        "luo" => Some(LowResourceMetadata {
            language_code: "luo".to_string(),
            language_family: Some("Nilo-Saharan (Nilotic)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true,
            speaker_population: Some(6_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(7_372), // MasakhaNER 2.0
        }),

        // Twi (Akan) - Ghana
        "tw" | "twi" | "aka" => Some(LowResourceMetadata {
            language_code: "twi".to_string(),
            language_family: Some("Atlantic-Congo (Kwa)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // Tonal diacritics
            speaker_population: Some(11_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(6_056), // MasakhaNER 2.0
        }),

        // Shona (chiShona) - Zimbabwe
        "sn" | "sna" => Some(LowResourceMetadata {
            language_code: "sna".to_string(),
            language_family: Some("Atlantic-Congo (Bantu)".to_string()),
            is_polysynthetic: false, // Agglutinative
            has_standard_orthography: true,
            speaker_population: Some(15_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(8_867), // MasakhaNER 2.0
        }),

        // Tigrinya - Eritrea/Ethiopia, Ge'ez script
        "ti" | "tir" => Some(LowResourceMetadata {
            language_code: "tir".to_string(),
            language_family: Some("Afro-Asiatic (Semitic)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // Ge'ez script
            speaker_population: Some(9_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: None, // Not in MasakhaNER, but in AfriSenti
        }),

        // Bambara - Mali
        "bm" | "bam" => Some(LowResourceMetadata {
            language_code: "bam".to_string(),
            language_family: Some("Atlantic-Congo (Mande)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // N'Ko or Latin
            speaker_population: Some(14_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(6_375), // MasakhaNER 2.0
        }),

        // Ewe - Ghana/Togo
        "ee" | "ewe" => Some(LowResourceMetadata {
            language_code: "ewe".to_string(),
            language_family: Some("Atlantic-Congo (Kwa)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // Tonal diacritics
            speaker_population: Some(7_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(5_007), // MasakhaNER 2.0
        }),

        // Fon - Benin
        "fon" => Some(LowResourceMetadata {
            language_code: "fon".to_string(),
            language_family: Some("Atlantic-Congo (Kwa)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true, // Tonal diacritics
            speaker_population: Some(2_200_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(6_204), // MasakhaNER 2.0
        }),

        // Mossi (Mooré) - Burkina Faso
        "mos" => Some(LowResourceMetadata {
            language_code: "mos".to_string(),
            language_family: Some("Atlantic-Congo (Gur)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: true,
            speaker_population: Some(8_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(6_793), // MasakhaNER 2.0
        }),

        // Setswana - Botswana/South Africa
        "tn" | "tsn" => Some(LowResourceMetadata {
            language_code: "tsn".to_string(),
            language_family: Some("Atlantic-Congo (Bantu)".to_string()),
            is_polysynthetic: false, // Agglutinative
            has_standard_orthography: true,
            speaker_population: Some(8_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(4_784), // MasakhaNER 2.0
        }),

        // Chichewa (Nyanja) - Malawi
        "ny" | "nya" => Some(LowResourceMetadata {
            language_code: "nya".to_string(),
            language_family: Some("Atlantic-Congo (Bantu)".to_string()),
            is_polysynthetic: false, // Agglutinative
            has_standard_orthography: true,
            speaker_population: Some(15_000_000),
            endangerment_level: Some(EndangermentLevel::Safe),
            training_examples: Some(8_928), // MasakhaNER 2.0
        }),

        // Ghomala - Cameroon (lower resource)
        "bbj" => Some(LowResourceMetadata {
            language_code: "bbj".to_string(),
            language_family: Some("Atlantic-Congo (Grassfields Bantu)".to_string()),
            is_polysynthetic: false,
            has_standard_orthography: false, // Developing
            speaker_population: Some(1_000_000),
            endangerment_level: Some(EndangermentLevel::Vulnerable),
            training_examples: Some(4_833), // MasakhaNER 2.0
        }),

        _ => None,
    }
}

/// MasakhaNER 2.0 language codes.
///
/// These codes can be used to load specific language splits from MasakhaNER 2.0.
/// Example: `load_dataset("masakhane/masakhaner2", "yor")` for Yoruba.
///
/// Entries are `(code, English language name)` pairs — three-letter codes as
/// used by the MasakhaNER 2.0 dataset configs — kept in alphabetical order by
/// code. The table is small (20 entries), so [`masakhaner2_language_name`]
/// resolves names with a linear scan rather than a map.
pub const MASAKHANER2_LANGUAGES: &[(&str, &str)] = &[
    ("bam", "Bambara"),
    ("bbj", "Ghomala"),
    ("ewe", "Ewe"),
    ("fon", "Fon"),
    ("hau", "Hausa"),
    ("ibo", "Igbo"),
    ("kin", "Kinyarwanda"),
    ("lug", "Luganda"),
    ("luo", "Dholuo"),
    ("mos", "Mossi"),
    ("nya", "Chichewa"),
    ("pcm", "Nigerian Pidgin"),
    ("sna", "Shona"),
    ("swa", "Swahili"),
    ("tsn", "Setswana"),
    ("twi", "Twi"),
    ("wol", "Wolof"),
    ("xho", "Xhosa"),
    ("yor", "Yoruba"),
    ("zul", "Zulu"),
];

/// Get language name from MasakhaNER 2.0 code.
///
/// Returns the English language name for a MasakhaNER 2.0 code (e.g.
/// `"yor"` → `"Yoruba"`), or `None` if the code is not in
/// [`MASAKHANER2_LANGUAGES`]. Lookup is an exact, case-sensitive match.
pub fn masakhaner2_language_name(code: &str) -> Option<&'static str> {
    // The table is tiny, so a straight scan is simpler than a map.
    for &(candidate, name) in MASAKHANER2_LANGUAGES {
        if candidate == code {
            return Some(name);
        }
    }
    None
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_language_metadata() {
        // Quechua: non-polysynthetic, Quechuan family.
        let qxo = language_metadata("qxo").expect("qxo should have metadata");
        assert_eq!(qxo.language_family.as_deref(), Some("Quechuan"));
        assert!(!qxo.is_polysynthetic);

        // Cherokee: polysynthetic and severely endangered.
        let chr = language_metadata("chr").expect("chr should have metadata");
        assert!(chr.is_polysynthetic);
        assert!(matches!(
            chr.endangerment_level,
            Some(EndangermentLevel::SeverelyEndangered)
        ));
    }

    #[test]
    fn test_orthographic_normalization() {
        // All normalization options on, no custom character mappings:
        // "Café" should come back diacritic-free and lowercased.
        let cfg = OrthographicConfig {
            unicode_normalize: true,
            case_insensitive: true,
            ignore_diacritics: true,
            char_mappings: HashMap::new(),
        };

        assert_eq!(
            LowResourceEvaluator::new().normalize_text("Café", &cfg),
            "cafe"
        );
    }

    #[test]
    fn test_evaluator_creation() {
        // Builder-style configuration should populate every optional field.
        let ev = LowResourceEvaluator::new()
            .with_morpheme_boundaries(MorphemeConfig::default())
            .with_orthographic_normalization(OrthographicConfig::default())
            .with_english_baseline(0.92);

        assert!(ev.morpheme_config.is_some() && ev.orthographic_config.is_some());
        assert_eq!(ev.english_baseline_f1, Some(0.92));
    }
}