anno-eval 0.10.0

    // =========================================================================
    // Coreference Datasets
    // =========================================================================
    // GAP: English pronoun-name coreference benchmark (Webster et al., 2018).
    // `url` fetches a single file, `gap-test.tsv`, from the upstream GitHub repo.
    // NOTE(review): `size_hint` looks like a whole-corpus figure rather than a
    // count for the test file alone — confirm which one consumers expect.
    GAP {
        name: "GAP",
        description: "Gender Ambiguous Pronoun resolution. Google's benchmark for exposing gender bias in coreference systems.",
        url: "https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv",
        entity_types: ["PER"],
        language: "en",
        domain: "wikipedia",
        license: "Apache-2.0",
        citation: "Webster et al. (2018)",
        paper_url: "https://aclanthology.org/Q18-1042/",
        year: 2018,
        format: "TSV",
        size_hint: "8,908 pronoun-name pairs",
        // Illustrative sample: a header row and one record separated by `\n`,
        // fields separated by `\t`.
        // NOTE(review): the upstream TSV appears to carry more columns (offsets,
        // B-coref, URL) — treat this as a sketch, not a parseable fixture.
        example: "ID\tText\tPronoun\tA\tB\tA-coref\ntest-1\tZoe met Alice and she waved.\tshe\tZoe\tAlice\tFALSE",
        notes: "Designed to expose gender bias; Kaggle shared task; balanced male/female",
        tasks: ["coref"],
        // NOTE(review): the org segment here differs from the GitHub org in `url`;
        // verify that this Hub id actually resolves.
        hf_id: "google-gap-coreference/gap",
        categories: [coref, bias_evaluation],
    },
    // PreCo: English coreference corpus (Chen et al., 2018).
    // Both `url` and `hf_id` point at the same Hugging Face dataset,
    // `coref-data/preco_raw`; `url` is a dataset page, not a raw file.
    PreCo {
        name: "PreCo",
        description: "Large-scale coreference from PreCo reading comprehension corpus. 10x larger than OntoNotes.",
        url: "https://huggingface.co/datasets/coref-data/preco_raw",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "reading_comprehension",
        license: "CC-BY-4.0",
        citation: "Chen et al. (2018)",
        paper_url: "https://aclanthology.org/D18-1016/",
        year: 2018,
        // File layout is JSONL while the scheme is CoNLLCoref — presumably the
        // scheme names the annotation model, not the on-disk format; TODO confirm.
        format: "JSONL",
        annotation_scheme: "CoNLLCoref",
        size_hint: "38k documents, includes singletons",
        notes: "Preschool vocabulary for cleaner evaluation; largest public coref corpus",
        splits: ["train", "dev", "test"],
        tasks: ["coref"],
        hf_id: "coref-data/preco_raw",
        categories: [coref],
    },
    // LitBank: coreference annotations over English literary fiction.
    // `url` points at one BRAT `.ann` file (Bleak House) under `coref/brat/`,
    // whereas `expected_docs` is 100 — a loader presumably has to fetch the
    // remaining files itself; TODO confirm.
    LitBank {
        name: "LitBank",
        description: "Literary coreference. 100 public-domain English fiction works (1719-1922) with ACE-style entities.",
        url: "https://raw.githubusercontent.com/dbamman/litbank/master/coref/brat/1023_bleak_house_brat.ann",
        entity_types: ["PER", "LOC", "ORG", "GPE", "FAC", "VEH"],
        language: "en",
        domain: "literature",
        license: "CC-BY-4.0",
        citation: "Bamman et al. (2019)",
        // NOTE(review): P19-1353 may be the ACL 2019 literary *event detection*
        // paper rather than the entity/coreference paper named in `citation` —
        // confirm that the link and the citation refer to the same publication.
        paper_url: "https://aclanthology.org/P19-1353/",
        year: 2019,
        format: "BRAT",
        annotation_scheme: "Standoff",
        size_hint: "100 novels, ~2k tokens each",
        // NOTE(review): "includes event coref" (and the `event_coref` task below)
        // should be checked against the corpus documentation.
        notes: "Focus on character coreference; includes event coref; public domain texts",
        // Single pseudo-split: no train/dev/test partition is declared here.
        splits: ["all"],
        tasks: ["ner", "coref", "event_coref"],
        expected_docs: 100,
        categories: [coref, literary],
    },
    // ECB+: cross-document event coreference benchmark (Cybulska & Vossen, 2014).
    // The identifier is `ECBPlus` because `+` is not valid in an identifier;
    // the display name keeps the original "ECB+".
    ECBPlus {
        name: "ECB+",
        description: "Event Coreference Bank Plus. Standard benchmark for cross-document event coreference resolution.",
        // `%2B` is the percent-encoding of `+` in the upstream path segments
        // (`ECB+_LREC2014/ECB+.zip`). The target is a ZIP archive, see `format`.
        url: "https://raw.githubusercontent.com/cltl/ecbPlus/master/ECB%2B_LREC2014/ECB%2B.zip",
        entity_types: ["EVENT", "TIME", "LOC", "PARTICIPANT"],
        language: "en",
        domain: "news",
        license: "CC-BY-3.0",
        citation: "Cybulska & Vossen (2014)",
        paper_url: "https://aclanthology.org/L14-1646/",
        year: 2014,
        format: "XML-ZIP",
        size_hint: "43 topics, 982 docs, ~7k events",
        // Illustrative two-document example; not an excerpt of the XML files.
        example: "Doc1: 'The earthquake [struck] at 3am.' Doc2: 'The [tremor] caused damage.'\nEvents: struck_1, tremor_2 -> coreferent (same event)",
        notes: "De facto CDCR standard; topic-clustered structure may cause overfitting",
        splits: ["train", "dev", "test"],
        tasks: ["coref", "event_coref", "cdcr"],
        // Matches the "982 docs" figure in `size_hint`.
        expected_docs: 982,
        categories: [coref, event_coref],
    },

    // OntoNotes 5.0 coreference layer (Pradhan et al., 2012).
    // `url` is the LDC catalog page for LDC2013T19, not a direct download.
    // NOTE(review): licensed via LDC but no `access_status` is given — confirm
    // that whatever default applies is appropriate for a catalog-gated corpus.
    OntoNotesCoref {
        name: "OntoNotes Coreference",
        description: "OntoNotes 5.0 coreference annotations. Gold-standard multi-genre coref including WSJ, broadcast, web.",
        url: "https://catalog.ldc.upenn.edu/LDC2013T19",
        entity_types: ["PER", "ORG", "GPE", "NORP"],
        language: "en",
        domain: "mixed",
        license: "LDC",
        citation: "Pradhan et al. (2012)",
        paper_url: "https://aclanthology.org/W12-4501/",
        year: 2012,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        size_hint: "3,493 documents, ~1.6M tokens",
        notes: "De facto standard for within-document coreference evaluation",
        categories: [coref],
    },

    // WikiCoref: small English Wikipedia coreference corpus.
    // No download location is recorded (`url` is empty) and the entry is
    // marked `Deprecated`; `notes` explains the prior host's TLS cert expired.
    WikiCoref {
        name: "WikiCoref",
        description: "Wikipedia coreference corpus. 30 documents with full coreference annotation.",
        // Intentionally empty until a working mirror is found (see `notes`).
        url: "",
        entity_types: ["PER", "ORG", "LOC"],
        language: "en",
        domain: "wikipedia",
        license: "CC-BY-SA-4.0",
        citation: "Ghaddar & Langlais (2016)",
        // NOTE(review): confirm this Anthology id is the WikiCoref corpus paper.
        paper_url: "https://aclanthology.org/C16-1252/",
        year: 2016,
        format: "CoNLL",
        size_hint: "30 documents, ~60k tokens",
        notes: "Long documents averaging 2k tokens; challenging for span-based models. Prior download URL has an expired TLS cert; needs a fresh mirror.",
        access_status: Deprecated,
        categories: [coref],
    },

    // ARRAU 3.0: multi-genre English anaphora corpus (Uryupina et al., 2024).
    // `url` and `paper_url` are the same ACL Anthology page, so no data
    // download location is recorded for this entry.
    ARRAU3 {
        name: "ARRAU 3.0",
        description: "Anaphora Resolution and Underspecification corpus version 3. Multi-genre with rich annotation.",
        url: "https://aclanthology.org/2024.codi-1.12/",
        // NOTE(review): "Event" is mixed-case here while the ECBPlus entry uses
        // "EVENT" — confirm whether type labels are compared case-sensitively.
        entity_types: ["PER", "ORG", "LOC", "Event"],
        language: "en",
        domain: "mixed",
        license: "Research",
        citation: "Uryupina et al. (2024)",
        paper_url: "https://aclanthology.org/2024.codi-1.12/",
        year: 2024,
        format: "MMAX2",
        annotation_scheme: "ARRAU",
        size_hint: "~350k tokens across multiple genres",
        notes: "Rich annotation including bridging, discourse deixis, and ambiguity",
        categories: [coref],
    },

    // AMI Meeting corpus: multi-party English meeting transcripts.
    // `url` is the project's download landing page, not a raw file.
    AMIMeeting {
        name: "AMI Meeting",
        description: "Meeting transcripts with coreference and dialogue act annotation.",
        url: "https://groups.inf.ed.ac.uk/ami/download/",
        entity_types: ["PER", "ORG", "LOC"],
        language: "en",
        domain: "dialogue",
        license: "CC-BY-4.0",
        citation: "Carletta et al. (2005)",
        // NOTE(review): this path ends in `/ami/icsi/`, a project web page rather
        // than a publication link — confirm it is the intended reference.
        paper_url: "https://groups.inf.ed.ac.uk/ami/icsi/",
        year: 2005,
        format: "XML",
        // Size is given in audio hours, not documents or tokens.
        size_hint: "100 hours of meetings",
        notes: "Multi-party dialogue; includes prosody and head gestures",
        categories: [coref, dialogue],
    },

    // ShARe/CLEF eHealth clinical coreference data (Suominen et al., 2013).
    // No download location is recorded (`url` is empty); the entry is gated
    // behind `Registration`, and `notes` says the former PhysioNet URL is gone.
    CLEFClinicalCoref {
        name: "CLEF Clinical Coreference",
        description: "Clinical coreference from ShARe/CLEF eHealth. Patient records with coref.",
        // Intentionally empty: controlled-access data with no known public link.
        url: "",
        // Clinical concept labels rather than the PER/LOC/ORG set used elsewhere.
        entity_types: ["Disorder", "Drug", "Procedure"],
        language: "en",
        domain: "clinical",
        license: "PhysioNet",
        citation: "Suominen et al. (2013)",
        // Links to the CLEF 2013 proceedings index, not to an individual paper.
        paper_url: "https://clef2013.clef-initiative.eu/index.php?page=pages/proceedings.php",
        year: 2013,
        format: "Standoff",
        size_hint: "298 discharge summaries",
        notes: "Clinical concept coreference; disorder mentions across sentences. Was hosted on PhysioNet; URL appears gone/renamed; requires controlled access.",
        access_status: Registration,
        categories: [coref, biomedical],
    },

    // RST Discourse Treebank (Carlson et al., 2001): WSJ articles from the Penn
    // Treebank annotated with Rhetorical Structure Theory trees. This is a
    // different resource from the Penn Discourse Treebank (PDTB).
    // `url` is the LDC catalog page for LDC2002T07, not a direct download.
    RSTDT {
        name: "RST Discourse Treebank",
        description: "Penn Treebank WSJ articles with RST annotations. Discourse relations and structure.",
        url: "https://catalog.ldc.upenn.edu/LDC2002T07",
        // No entity inventory: the annotation is discourse structure, not mentions.
        entity_types: [],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Carlson et al. (2001)",
        // NOTE(review): confirm this Anthology id resolves to the Carlson et al.
        // corpus paper; the id prefix suggests a 2000 venue while `year` is 2001.
        paper_url: "https://aclanthology.org/A00-1036/",
        year: 2001,
        format: "Custom",
        size_hint: "385 WSJ articles",
        notes: "RST discourse structure; useful for discourse-aware coreference",
        // Filed under `coref` because of its use as auxiliary signal (see `notes`).
        categories: [coref],
    },

    // WinoBias: coreference gender-bias benchmark (Zhao et al., 2018).
    // `url` fetches a single file, `anti_stereotyped_type1.txt.dev`.
    // NOTE(review): `size_hint` presumably describes the full benchmark, not
    // this one file — confirm.
    WinoBias {
        name: "WinoBias",
        description: "Coreference bias benchmark. Winograd-schema sentences testing occupational gender stereotypes.",
        url: "https://raw.githubusercontent.com/uclanlp/corefBias/master/WinoBias/wino/data/anti_stereotyped_type1.txt.dev",
        entity_types: ["PER"],
        language: "en",
        domain: "evaluation",
        license: "MIT",
        citation: "Zhao et al. (2018)",
        paper_url: "https://aclanthology.org/N18-2003/",
        year: 2018,
        // Plain-text files with a dataset-specific layout, hence "Custom".
        format: "Custom",
        size_hint: "3,160 sentences",
        notes: "Type 1 (syntactic) and Type 2 (semantic) splits; tests BLS occupational stats",
        categories: [coref, bias_evaluation],
    },

    // =========================================================================
    // Indigenous Language Datasets
    // =========================================================================
    // qxoRef: coreference corpus for Conchucos Quechua (ISO 639-3 `qxo`).
    // `url` is pinned to a specific commit of the author's repository and
    // fetches a single CoNLL file (`qxoRef_AZ23.conll`).
    QxoRef {
        name: "qxoRef",
        description: "First coreference corpus for Conchucos Quechua. Historically significant as first indigenous coref resource.",
        url: "https://raw.githubusercontent.com/elizabethpankratz/qxoRef/f0eb5716573b3f428bfcfdda923b195d0e7967b8/qxoRef_AZ23.conll",
        entity_types: ["PER", "LOC", "ORG"],
        language: "qxo",
        domain: "narrative",
        license: "CC-BY-NC-SA-4.0",
        // Author of the corpus paper at `paper_url`; same person as the repo
        // owner in `url`.
        citation: "Pankratz (2021)",
        paper_url: "https://aclanthology.org/2021.americasnlp-1.1/",
        year: 2021,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        size_hint: "12 docs, 332 mentions",
        notes: "First indigenous coreference corpus; pro-drop language; agglutinative morphology",
        categories: [coref, indigenous, low_resource],
    },
    // AmericasNLI: natural language inference for Indigenous American languages
    // (Ebrahimi et al., 2022). Not an entity dataset, so `entity_types` is empty.
    // `url` is pinned to a specific commit and fetches `test.tsv` only.
    AmericasNLI {
        name: "AmericasNLI",
        description: "NLI for 10 Indigenous American languages (Quechua, Guaraní, Nahuatl, etc.).",
        url: "https://raw.githubusercontent.com/nala-cub/AmericasNLI/be3c351b7e1ae69936c61bfde3e24f30757db9ac/test.tsv",
        entity_types: [],
        // "mul" marks the entry as multi-language rather than naming one code.
        language: "mul",
        domain: "general",
        license: "CC-BY-4.0",
        citation: "Ebrahimi et al. (2022)",
        paper_url: "https://aclanthology.org/2022.acl-long.435/",
        year: 2022,
        format: "TSV",
        notes: "Tests zero-shot transfer from multilingual models; 10 indigenous languages",
        categories: [indigenous, multilingual, low_resource],
    },
    // Cherokee NER: Cherokee-English parallel data used for NER transfer.
    // No download location is recorded (`url` is empty) and the entry is
    // marked `Deprecated`; `notes` says the prior URL is dead.
    CherokeeNER {
        name: "Cherokee NER",
        description: "Cherokee-English parallel corpus for NER transfer. Uses Syllabary script.",
        // Intentionally empty until a working mirror is found (see `notes`).
        url: "",
        entity_types: ["PER", "LOC", "ORG"],
        language: "chr",
        domain: "indigenous",
        license: "Research",
        citation: "Zhang et al. (2020)",
        // NOTE(review): confirm this Anthology id matches the cited paper.
        paper_url: "https://aclanthology.org/2020.findings-emnlp.464/",
        year: 2020,
        format: "Custom",
        notes: "Syllabary script (85 characters); polysynthetic language; ~7k speakers. Prior URL is dead; needs a fresh mirror.",
        access_status: Deprecated,
        categories: [ner, indigenous, low_resource],
    },
    // Nahuatl NER: entity annotations over colonial-era and modern Nahuatl text.
    // No download location (`url` is empty) and no `paper_url` are recorded;
    // the entry is marked `Deprecated` because the prior URL is dead.
    NahuatlNER {
        name: "Nahuatl NER",
        description: "Named entity recognition for Nahuatl (Aztec language). Colonial-era texts and modern usage.",
        // Intentionally empty until a working mirror is found (see `notes`).
        url: "",
        entity_types: ["PER", "LOC", "ORG"],
        // "nah" is the collective code for the Nahuatl languages.
        language: "nah",
        domain: "historical",
        license: "CC-BY-4.0",
        citation: "Gutierrez-Vasquez et al. (2023)",
        year: 2023,
        format: "CoNLL",
        notes: "Polysynthetic Uto-Aztecan language; ~1.7M speakers; includes colonial manuscripts. Prior URL is dead; needs a fresh mirror.",
        access_status: Deprecated,
        categories: [ner, indigenous, low_resource, historical],
    },
    // Māori NER: entity annotations for Te Reo Māori.
    // No download location (`url` is empty) and no `paper_url` are recorded;
    // obtaining the data requires contacting the authors (`ContactAuthors`).
    // The identifier drops the macron because it must be plain ASCII-friendly;
    // the display name keeps "Māori".
    MaoriNER {
        name: "Māori NER",
        description: "Named entity recognition for Te Reo Māori. New Zealand indigenous language corpus.",
        // Intentionally empty: there is no public link, see `access_status`.
        url: "",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mi",
        domain: "indigenous",
        license: "Research",
        // Cites an organisation rather than a paper.
        citation: "Te Hiku Media (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Polynesian language; ~50k fluent speakers; limited training data available",
        access_status: ContactAuthors,
        categories: [ner, indigenous, low_resource],
    },
    // Welsh NER: entity annotations for Welsh (Cymraeg).
    // No download location (`url` is empty) and no `paper_url` are recorded;
    // the entry is marked `Deprecated` because the prior URL is dead.
    WelshNER {
        name: "Welsh NER",
        description: "Named entity recognition for Welsh (Cymraeg). Celtic language NER corpus.",
        // Intentionally empty until a working mirror is found (see `notes`).
        url: "",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "cy",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Roberts et al. (2021)",
        year: 2021,
        format: "CoNLL",
        // NOTE(review): "Welsh-specific entity types" is not reflected in the
        // generic `entity_types` list above — confirm which is accurate.
        notes: "Celtic language; ~900k speakers; supports Welsh-specific entity types. Prior URL is dead; needs a fresh mirror.",
        access_status: Deprecated,
        categories: [ner, indigenous, low_resource],
    },
    // Basque NER: entity annotations for Basque (Euskara).
    // No download location (`url` is empty) and no `paper_url` are recorded;
    // the entry is marked `Deprecated` because the prior URL is dead.
    BasqueNER {
        name: "Basque NER",
        description: "Named entity recognition for Basque (Euskara). Language isolate NER corpus.",
        // Intentionally empty until a working mirror is found (see `notes`).
        url: "",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "eu",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Alegria et al. (2019)",
        year: 2019,
        format: "CoNLL",
        size_hint: "~80k tokens",
        notes: "Language isolate; agglutinative morphology; ~750k speakers; ergative-absolutive alignment. Prior URL is dead; needs a fresh mirror.",
        access_status: Deprecated,
        categories: [ner, indigenous, low_resource],
    },

    // =========================================================================
    // Historical NER Datasets
    // =========================================================================
    // HIPE-2022: multilingual historical NER shared-task data (Ehrmann et al., 2022).
    // `url` is pinned to a specific commit and fetches one file only: the
    // v2.1 `hipe2020` German test set. This matches `splits: ["test"]` but not
    // the multi-language scope implied by `language: "mul"`.
    HIPE2022 {
        name: "HIPE-2022",
        // NOTE(review): verify the "11 languages including Latin" figure against
        // the shared-task overview; the dataset list may cover fewer languages.
        description: "Multilingual Historical NER. 6 datasets across 11 languages including Latin.",
        url: "https://raw.githubusercontent.com/hipe-eval/HIPE-2022-data/147f5bc3c7fb7e5c6b024a9ffd6503cd019fb9ea/data/v2.1/hipe2020/de/HIPE-2022-v2.1-hipe2020-test-de.tsv",
        entity_types: ["PER", "LOC", "ORG", "PROD"],
        language: "mul",
        domain: "historical",
        license: "CC-BY-NC-4.0",
        citation: "Ehrmann et al. (2022)",
        // Direct PDF link (CEUR-WS), unlike the landing-page links used elsewhere.
        paper_url: "https://ceur-ws.org/Vol-3180/paper-83.pdf",
        year: 2022,
        format: "TSV",
        annotation_scheme: "IOB2",
        notes: "CLEF-HIPE shared task; includes Latin and Classical commentary; OCR noise",
        splits: ["test"],
        tasks: ["ner"],
        categories: [ner, historical, multilingual],
    },
    // HistNERo: NER corpus over historical Romanian newspapers.
    // `url` is the GitHub repository page, not a raw data file, so a loader
    // cannot fetch annotations from it directly.
    HistNERo {
        name: "HistNERo",
        description: "Romanian historical newspaper NER. First Romanian historical NER corpus from four regions.",
        url: "https://github.com/avramandrei/histnero",
        entity_types: ["PER", "LOC", "ORG", "DATE", "MISC"],
        language: "ro",
        domain: "historical",
        license: "CC-BY-4.0",
        // First author of the arXiv paper at `paper_url`; same person as the
        // repo owner in `url`.
        citation: "Avram et al. (2024)",
        paper_url: "https://arxiv.org/abs/2405.00155",
        year: 2024,
        format: "CoNLL",
        size_hint: "~323k tokens, 19th-20th century newspapers",
        notes: "Four historical Romanian regions (Bessarabia, Moldavia, Transylvania, Wallachia); diachronic benchmark",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, historical, low_resource],
    },
    // Quaero Old Press: NER over French newspapers from 1890 (Galibert et al., 2012).
    // No download location (`url` is empty) and no `paper_url` are recorded;
    // obtaining the data requires contacting the authors (`ContactAuthors`).
    QuaeroOldPress {
        name: "Quaero Old Press",
        description: "French historical newspaper NER from 1890. OCR-corrected with manual NE annotations.",
        // Intentionally empty: there is no public link, see `access_status`.
        url: "",
        entity_types: ["PER", "LOC", "ORG", "TIME", "PROD"],
        language: "fr",
        domain: "historical",
        license: "Research",
        citation: "Galibert et al. (2012)",
        year: 2012,
        format: "XML",
        size_hint: "295 pages, 1890 newspapers",
        notes: "French historical NER benchmark; manual OCR corrections; reasonably clean historical text",
        // Only a test split is declared for this entry.
        splits: ["test"],
        tasks: ["ner"],
        access_status: ContactAuthors,
        categories: [ner, historical],
    },
    // Historical Chinese multi-task IE corpus (NER, entity linking, coreference,
    // relations). No download location is recorded (`url` is empty); obtaining
    // the data requires contacting the authors (`ContactAuthors`).
    HistoricalChineseNER {
        name: "Historical Chinese NER",
        description: "Multi-task historical Chinese corpus. NER + entity linking + coreference + relations.",
        // Intentionally empty: there is no public link, see `access_status`.
        url: "",
        entity_types: ["PER", "LOC", "ORG", "TIME", "OFFICIAL"],
        language: "zh",
        domain: "historical",
        license: "Research",
        // NOTE(review): this names the venue, not the authors — replace with an
        // author citation once confirmed from the paper at `paper_url`.
        citation: "LREC-COLING (2024)",
        // Direct PDF link, unlike the landing-page Anthology links used elsewhere.
        paper_url: "https://aclanthology.org/2024.lrec-main.35.pdf",
        year: 2024,
        format: "JSONL",
        // Describes content rather than giving a count.
        size_hint: "Historical Chinese newspapers + documents",
        notes: "LREC-COLING 2024; multi-task historical IE benchmark; cross-genre historical Chinese",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "el", "coref", "re"],
        access_status: ContactAuthors,
        // NOTE(review): `multilingual` sits oddly with the single `language: "zh"`,
        // and `tasks` lists "re" without a relation-extraction category — confirm.
        categories: [ner, entity_linking, coref, historical, multilingual],
    },
    // CHisIEC: NER and relation extraction over pre-modern Chinese histories
    // (Tang et al., 2024). `language` is "lzh" (Literary Chinese), not "zh".
    // `url` fetches one file only, the relation-extraction test set
    // (`data/re/coling_test.json`), although `splits` lists train/dev/test and
    // `tasks` also includes "ner" — a loader presumably needs further paths;
    // TODO confirm.
    CHisIEC {
        name: "CHisIEC",
        description: "Chinese Historical Information Extraction Corpus. Ancient Chinese NER + RE with 12 relation types.",
        // Tracks the `main` branch, so upstream changes can alter the content.
        url: "https://raw.githubusercontent.com/tangxuemei1995/CHisIEC/main/data/re/coling_test.json",
        entity_types: ["PER", "LOC", "OFI", "BOOK"],
        language: "lzh",
        domain: "historical",
        license: "Research",
        citation: "Tang et al. (2024)",
        paper_url: "https://aclanthology.org/2024.lrec-main.283/",
        year: 2024,
        format: "JSON",
        size_hint: "3,891 paragraphs, 13,520 entities, 8,228 relations",
        // Contains non-ASCII text (CJK); keep the file encoded as UTF-8.
        notes: "Ancient Chinese dynastic histories (24史); 12 domain-specific relations for historical socio-political structures; pre-modern Chinese (文言文)",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "re"],
        categories: [ner, relation_extraction, historical, ancient],
    },