anno-eval 0.10.0

Evaluation harnesses, datasets, and muxer-backed sampling for anno
    // =========================================================================
    // Biomedical NER Datasets
    // =========================================================================
    // BC5CDR: English biomedical NER entry with two entity types (Chemical, Disease).
    // `url` references a single `test.txt` file in the shreyashub/BioFLAIR GitHub
    // repository; `format` and `annotation_scheme` are both "BIO", matching the
    // one-token-per-line `example` string below.
    BC5CDR {
        name: "BC5CDR",
        description: "Biomedical NER for diseases and chemicals. Created for BioCreative V CDR task, a major biomedical NLP benchmark.",
        url: "https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/test.txt",
        entity_types: ["Chemical", "Disease"],
        language: "en",
        domain: "biomedical",
        // NOTE(review): "Public" is not a specific license identifier — confirm the actual terms.
        license: "Public",
        citation: "Li et al. (2016)",
        paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/baw068/2630414",
        year: 2016,
        format: "BIO",
        annotation_scheme: "BIO",
        size_hint: "~1500 PubMed abstracts, ~14k mentions",
        // Illustrative sample only: one "token tag" pair per line, separated by "\n".
        example: "Aspirin B-Chemical\ninduced O\nhepatotoxicity B-Disease\nwas O\nobserved O\n. O",
        tasks: ["ner"],
        hf_id: "tner/bc5cdr",
        categories: [ner, biomedical],
    },
    // NCBI Disease: English biomedical NER entry with a single entity type (Disease).
    // `url` references a single `test.txt` file in the shreyashub/BioFLAIR GitHub
    // repository; `format` and `annotation_scheme` are both "BIO".
    NCBIDisease {
        name: "NCBI Disease",
        description: "NCBI disease mentions corpus. Foundational resource for disease NER from NIH.",
        url: "https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/NCBI-disease/test.txt",
        entity_types: ["Disease"],
        language: "en",
        domain: "biomedical",
        // NOTE(review): "Public" is not a specific license identifier — confirm the actual terms.
        license: "Public",
        citation: "Dogan et al. (2014)",
        paper_url: "https://www.sciencedirect.com/science/article/pii/S1532046413001974",
        year: 2014,
        format: "BIO",
        annotation_scheme: "BIO",
        size_hint: "~800 PubMed abstracts, ~6k mentions",
        // Illustrative sample only; shows a multi-token mention (B-Disease followed by I-Disease tags).
        example: "The O\npatient O\nwas O\ndiagnosed O\nwith O\ntype B-Disease\n2 I-Disease\ndiabetes I-Disease\n. O",
        tasks: ["ner"],
        // NOTE(review): un-namespaced id, unlike the "owner/name" ids on the other
        // entries in this section — confirm it still resolves.
        hf_id: "ncbi_disease",
        categories: [ner, biomedical],
    },
    // GENIA: English biomedical NER entry for molecular biology with five entity
    // types (DNA, RNA, protein, cell_line, cell_type). `url` and `hf_id` both
    // reference the chufangao/GENIA-NER dataset on Hugging Face.
    GENIA {
        name: "GENIA",
        description: "Biomedical NER for molecular biology. First large-scale biomedical NER corpus; historically significant.",
        url: "https://huggingface.co/datasets/chufangao/GENIA-NER",
        entity_types: ["DNA", "RNA", "protein", "cell_line", "cell_type"],
        language: "en",
        domain: "biomedical",
        license: "GENIA Project License",
        citation: "Kim et al. (2003)",
        paper_url: "https://academic.oup.com/bioinformatics/article/19/suppl_1/i180/227927",
        year: 2003,
        // NOTE(review): metadata says XML / Standoff, but `example` below is a
        // flattened token-per-line BIO illustration — confirm which form consumers expect.
        format: "XML",
        annotation_scheme: "Standoff",
        size_hint: "2000 MEDLINE abstracts, ~100k entities",
        // One "token tag" pair per line. "NF-kappa B" and "kappa B" are split into
        // separate tokens so that no token contains a space (a space inside a token
        // would make the line read as three columns instead of two).
        // NOTE(review): the determiners "The"/"the" carry B- tags here — confirm
        // that is intended for this illustration.
        example: "The B-protein\nNF-kappa I-protein\nB I-protein\nprotein I-protein\nbinds O\nto O\nthe B-DNA\nkappa I-DNA\nB I-DNA\nbinding I-DNA\nsite I-DNA\n. O",
        notes: "Nested entities common; requires special handling; pioneered biomedical NER",
        hf_id: "chufangao/GENIA-NER",
        categories: [ner, biomedical],
    },

    // AnatEM: English biomedical NER entry with a single entity type (Anatomy).
    // `url` and `hf_id` both reference the disi-unibo-nlp/AnatEM dataset on
    // Hugging Face. This entry declares no `example`, `annotation_scheme`, or
    // `tasks` field.
    AnatEM {
        name: "AnatEM",
        description: "Anatomical entity mention corpus. 1,212 PubMed abstracts with anatomical structures.",
        url: "https://huggingface.co/datasets/disi-unibo-nlp/AnatEM",
        entity_types: ["Anatomy"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-4.0",
        // NOTE(review): Ohta et al. (2012) may describe the earlier, smaller AnEM
        // corpus rather than AnatEM itself — confirm `citation`, `paper_url`,
        // and `year` refer to the corpus described by `size_hint`.
        citation: "Ohta et al. (2012)",
        paper_url: "https://aclanthology.org/W12-2402/",
        year: 2012,
        format: "Standoff",
        size_hint: "1,212 abstracts, ~7k entity mentions",
        notes: "Fine-grained anatomical mentions; standalone or nested within other entities",
        hf_id: "disi-unibo-nlp/AnatEM",
        categories: [ner, biomedical],
    },

    // BC2GM: English biomedical NER entry for gene/protein mentions from the
    // BioCreative II Gene Mention task. `url` and `hf_id` both reference the
    // spyysalo/bc2gm_corpus dataset on Hugging Face; `format` is "IOB2".
    BC2GM {
        name: "BC2GM",
        description: "BioCreative II Gene Mention recognition. Gold-standard gene/protein name tagging.",
        url: "https://huggingface.co/datasets/spyysalo/bc2gm_corpus",
        // NOTE(review): two types are listed, but the `example` below only uses
        // the Gene label — confirm whether the data distinguishes Gene from Protein.
        entity_types: ["Gene", "Protein"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Smith et al. (2008)",
        paper_url: "https://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-s2-s2",
        year: 2008,
        format: "IOB2",
        size_hint: "20k sentences, ~24k gene mentions",
        // Illustrative sample only: one "token tag" pair per line, separated by "\n".
        example: "The B-Gene\np53 I-Gene\nprotein I-Gene\nregulates O\ncell O\ncycle O\narrest O\n. O",
        notes: "Classic benchmark for gene/protein NER; BioCreative shared task",
        tasks: ["ner"],
        hf_id: "spyysalo/bc2gm_corpus",
        access_status: Public,
        categories: [ner, biomedical],
    },

    // BC4CHEMD: English biomedical NER entry with a single entity type (Chemical)
    // from the BioCreative IV chemical mention detection task. `url` and `hf_id`
    // both reference the chintagunta85/bc4chemd dataset on Hugging Face;
    // `format` is "IOB2".
    BC4CHEMD {
        name: "BC4CHEMD",
        description: "BioCreative IV Chemical Entity Mention Detection. Drug and chemical name recognition.",
        url: "https://huggingface.co/datasets/chintagunta85/bc4chemd",
        entity_types: ["Chemical"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Krallinger et al. (2015)",
        paper_url: "https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2",
        year: 2015,
        format: "IOB2",
        size_hint: "10k PubMed abstracts, ~84k chemical mentions",
        // One "token tag" pair per line. In IOB2 the first token of a mention
        // carries the B- tag, so the single-token mention "aspirin" is B-Chemical
        // (the previous text had a tag line with no token followed by an I- tag).
        example: "Treatment O\nwith O\naspirin B-Chemical\nreduced O\ninflammation O\n. O",
        notes: "Chemical NER benchmark; includes IUPAC names, trivial names, abbreviations",
        tasks: ["ner"],
        hf_id: "chintagunta85/bc4chemd",
        access_status: Public,
        categories: [ner, biomedical],
    },