anno-eval 0.10.0

    // =========================================================================
    // Entity Linking Datasets
    // =========================================================================
    // ZELDA entity-disambiguation benchmark entry. `url` fetches only the
    // AIDA-B test CoNLL file (see `notes`), matching the single "test" split.
    ZELDA {
        name: "ZELDA",
        description: "Entity disambiguation benchmark. 95k Wikipedia paragraphs, 8 ED datasets unified.",
        url: "https://raw.githubusercontent.com/flairNLP/zelda/main/test_data/conll/test_aida-b.conll",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "wikipedia",
        license: "MIT",
        citation: "Milich & Akbik (2023)",
        paper_url: "https://aclanthology.org/2023.eacl-main.151/",
        year: 2023,
        format: "CoNLL",
        size_hint: "95k paragraphs, 825k entities",
        notes: "Standardized ED evaluation; Wikipedia KB; no emerging entities; using AIDA-B test subset",
        splits: ["test"],
        tasks: ["el", "ner"],
        access_status: Public,
        categories: [ner, entity_linking],
    },
    // TweetNERD: Twitter NER + entity linking. `url` is a Zenodo record page
    // rather than a direct file link; no `access_status` is given.
    // NOTE(review): `tasks` includes "el" but `categories` has no
    // `entity_linking` (unlike ZELDA / AIDA-CoNLL) — confirm this is intended.
    TweetNERD {
        name: "TweetNERD",
        description: "Twitter NER + Entity Linking. End-to-end NERD benchmark spanning 2010-2021.",
        url: "https://zenodo.org/records/6617192",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "social_media",
        license: "CC-BY-4.0",
        citation: "Mishra et al. (2022)",
        paper_url: "https://arxiv.org/abs/2210.08129",
        year: 2022,
        format: "JSONL",
        size_hint: "340k+ tweets",
        notes: "NeurIPS 2022; temporal drift; NER + EL + end-to-end NERD",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "el"],
        categories: [ner, social_media],
    },
    // AIDA-CoNLL: Wikipedia entity-linking annotations over CoNLL-2003 news text.
    // Split names follow the AIDA convention (train/testa/testb), not
    // train/dev/test. `url` is the project landing page, not a data file.
    AIDACoNLL {
        name: "AIDA-CoNLL",
        description: "Primary entity linking benchmark linking CoNLL-2003 mentions to Wikipedia. De-facto standard for end-to-end EL evaluation.",
        url: "https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Hoffart et al. (2011)",
        paper_url: "https://aclanthology.org/D11-1072/",
        year: 2011,
        format: "CoNLL",
        annotation_scheme: "IOB2",
        size_hint: "~1,400 docs, ~34k mentions linked to Wikipedia",
        notes: "Built on Reuters CoNLL-2003; AIDA-train/A/B splits; foundational EL benchmark; YAGO KB",
        splits: ["train", "testa", "testb"],
        tasks: ["ner", "el", "entity_linking", "ned"],
        categories: [ner, entity_linking],
    },

    // =========================================================================
    // Additional Nested NER
    // =========================================================================
    // ACE 2005: nested NER + relations + events. `url` is left empty because
    // the corpus requires an LDC license; access is marked `Registration`.
    // `paper_url` points at the LDC catalog entry. No `splits`/`tasks` fields.
    ACE2005 {
        name: "ACE 2005",
        description: "Automatic Content Extraction 2005. Nested NER + relations + events.",
        url: "",  // Requires LDC license
        entity_types: ["PER", "ORG", "GPE", "LOC", "FAC", "WEA", "VEH"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Walker et al. (2006)",
        paper_url: "https://catalog.ldc.upenn.edu/LDC2006T06",
        year: 2005,
        format: "XML",
        annotation_scheme: "Standoff",
        size_hint: "~600 documents",
        notes: "Gold standard for nested NER; includes Arabic/Chinese; defines modern IE evaluation",
        access_status: Registration,
        categories: [ner, nested_ner, relation_extraction],
    },
    // NNE (Ringland et al., ACL 2019): nested NER over English newswire with
    // up to 6 nesting levels. The headline corpus figure (~280k) counts entity
    // mentions, not tokens, so `size_hint` states mentions. `entity_types`
    // lists a coarse subset of labels.
    NNE {
        name: "NNE (Nested Named Entities)",
        description: "Large-scale nested NER corpus over Penn Treebank WSJ newswire. Deep nesting up to 6 levels.",
        url: "https://github.com/nickyringland/nested_named_entities",
        entity_types: ["PER", "LOC", "ORG", "GPE", "NORP", "FAC", "PRODUCT", "EVENT", "WORK", "LAW"],
        language: "en",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Ringland et al. (2019)",
        paper_url: "https://aclanthology.org/P19-1510/",
        year: 2019,
        format: "CoNLL",
        size_hint: "~280k mentions, deep nesting",
        notes: "ACL 2019; based on ACE/OntoNotes; up to 6 nested levels; stress test for nested NER",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, nested_ner],
    },
    // GENIA nested NER (biomedical). `url` points at a single
    // `genia.test.iob2` file in the boundary-aware-nested-ner GitHub repo,
    // even though `splits` lists train/dev/test. `example` shows a nested
    // PROTEIN span inside a longer PROTEIN span.
    GENIANested {
        name: "GENIA Nested",
        description: "Biomedical nested NER from GENIA corpus. Up to 3 levels of nesting.",
        url: "https://raw.githubusercontent.com/thecharm/boundary-aware-nested-ner/master/Our_boundary-aware_model/data/genia/genia.test.iob2",
        entity_types: ["DNA", "RNA", "PROTEIN", "CELL_LINE", "CELL_TYPE"],
        language: "en",
        domain: "biomedical",
        license: "GENIA Project License",
        citation: "Kim et al. (2003)",
        paper_url: "https://aclanthology.org/W03-1302/",
        year: 2003,
        format: "CoNLL",
        size_hint: "~2k abstracts",
        example: "[[IL-2 receptor] alpha chain] promoter\n[IL-2 receptor]: PROTEIN, [IL-2 receptor alpha chain]: PROTEIN (nested)",
        notes: "Canonical biomedical nested NER benchmark; used alongside ACE for nested NER evaluation",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, nested_ner, biomedical],
    },
    // Chinese nested NER entry (language "zh"); tagged `multilingual` in
    // addition to `nested_ner`. No `paper_url` is recorded for this dataset.
    ChineseNestedNER {
        name: "Chinese Nested NER",
        description: "Chinese nested named entity recognition. Multiple levels of embedded entities.",
        url: "https://github.com/LeeSureman/Nested-NER",
        entity_types: ["PER", "ORG", "LOC", "GPE"],
        language: "zh",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Wang et al. (2020)",
        year: 2020,
        format: "JSONL",
        size_hint: "~20k sentences",
        notes: "Chinese nested NER benchmark; designed for span-based model evaluation; CJK characters",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, nested_ner, multilingual],
    },
    // SciNER nested entry: scientific-abstract NER with an additional "re"
    // task. Filed under `arcane_domain` rather than a science-specific category.
    SCINERNested {
        name: "SciNER Nested",
        description: "Scientific paper NER with nested annotations. Methods, tasks, and datasets.",
        url: "https://github.com/allenai/sciie",
        entity_types: ["TASK", "METHOD", "METRIC", "MATERIAL", "GENERIC"],
        language: "en",
        domain: "scientific",
        license: "Apache-2.0",
        citation: "Luan et al. (2018)",
        paper_url: "https://aclanthology.org/D18-1360/",
        year: 2018,
        format: "JSONL",
        size_hint: "~500 abstracts",
        notes: "Scientific information extraction; nested spans common in methodology descriptions",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "re"],
        categories: [ner, nested_ner, arcane_domain],
    },

    // =========================================================================
    // Additional Discontinuous NER
    // =========================================================================
    // ShARe/CLEF clinical NER with discontinuous entities. `url` is left empty
    // because the data is obtained through PhysioNet research access; access
    // is marked `Registration`. No `splits`/`tasks` fields are declared.
    ShAReCLEF {
        name: "ShARe/CLEF",
        description: "Shared Annotated Resources for clinical NER. ShARe/CLEF eHealth shared task.",
        url: "",  // Research access via PhysioNet
        entity_types: ["DISORDER", "FINDING", "PROCEDURE"],
        language: "en",
        domain: "clinical",
        license: "PhysioNet",
        citation: "Pradhan et al. (2013)",
        paper_url: "https://aclanthology.org/S13-2056/",
        year: 2013,
        format: "BRAT",
        annotation_scheme: "Standoff",
        size_hint: "~300 clinical notes",
        notes: "Discontinuous clinical entities; SNOMED-CT normalization; de-identified records",
        access_status: Registration,
        categories: [ner, biomedical, discontinuous_ner],
    },
    // GermEval 2014 German NER entry (language "de"), registered here for its
    // non-contiguous / derived / embedded entity spans. `url` is the shared
    // task's data page on Google Sites (site slug `germeval2014ner`).
    GermEvalDiscontinuous {
        name: "GermEval Discontinuous",
        description: "German discontinuous NER from GermEval 2014. Non-contiguous entity spans.",
        url: "https://sites.google.com/site/germeval2014ner/data",
        entity_types: ["PER", "ORG", "LOC", "OTH"],
        language: "de",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Benikova et al. (2014)",
        paper_url: "https://aclanthology.org/W14-1707/",
        year: 2014,
        format: "CoNLL",
        size_hint: "~87k tokens",
        notes: "German discontinuous entities; derived entities; embedded entities",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, discontinuous_ner, multilingual],
    },
    // Adverse Drug Reaction corpus with discontinuous mentions, drawn from
    // patient forum posts (hence both `biomedical` and `social_media`).
    // Omits `paper_url`, `splits`, and `tasks`.
    ADRDiscontinuous {
        name: "ADR Discontinuous",
        description: "Adverse Drug Reaction corpus with discontinuous mentions. Patient forum posts.",
        url: "https://github.com/Aitslab/ADR-DisNER",
        entity_types: ["ADR", "DRUG", "SYMPTOM"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-4.0",
        citation: "Metke-Jimenez et al. (2016)",
        year: 2016,
        format: "BRAT",
        size_hint: "~2k posts",
        notes: "Social media ADR mentions; many discontinuous spans; health forum text",
        categories: [ner, biomedical, discontinuous_ner, social_media],
    },
    // PubMed abstracts with discontinuous chemical/disease/gene mentions.
    // Omits `paper_url`, `splits`, and `tasks`.
    PubMedDiscontinuous {
        name: "PubMed Discontinuous",
        description: "PubMed abstracts with discontinuous biomedical entities. Complex entity boundaries.",
        url: "https://github.com/dmis-lab/discontinuous-ner",
        entity_types: ["CHEMICAL", "DISEASE", "GENE"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Dai et al. (2020)",
        year: 2020,
        format: "CoNLL",
        size_hint: "~8k abstracts",
        notes: "Scientific abstracts; discontinuous chemical and disease mentions",
        categories: [ner, biomedical, discontinuous_ner],
    },

    // =========================================================================
    // Additional Relation Extraction
    // =========================================================================
    // TACRED relation-extraction dataset (42 relations). `url` is empty
    // because the data is LDC-licensed; access is marked `Registration`.
    // `example` illustrates one subject/object/relation instance.
    TACRED {
        name: "TACRED",
        description: "TAC Relation Extraction Dataset. 42 relations from TAC KBP.",
        url: "",  // LDC license
        entity_types: ["PER", "ORG"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Zhang et al. (2017)",
        paper_url: "https://aclanthology.org/D17-1004/",
        year: 2017,
        format: "JSONL",
        size_hint: "106k examples",
        example: "subj: 'Tim Cook', obj: 'Apple', relation: per:employee_of, text: 'Tim Cook is the CEO of Apple Inc.'",
        notes: "42 relations; majority no_relation; known label noise; Re-TACRED fixes some issues",
        access_status: Registration,
        categories: [relation_extraction],
    },
    // SemEval-2010 Task 8 relation classification between nominals.
    // `entity_types` holds the two argument markers (e1/e2), not NER labels.
    // `format` is "Custom"; no `splits`/`tasks` fields are declared.
    SemEval2010Task8 {
        name: "SemEval-2010 Task 8",
        description: "Semantic relation classification between nominals. 9 relation types.",
        url: "https://github.com/sahitya0000/Relation-Classification",
        entity_types: ["e1", "e2"],  // Entity markers for relation endpoints
        language: "en",
        domain: "mixed",
        license: "Research",
        citation: "Hendrickx et al. (2010)",
        paper_url: "https://aclanthology.org/S10-1006/",
        year: 2010,
        format: "Custom",
        size_hint: "~10k examples",
        notes: "Classic RE benchmark; 9 directed relations + OTHER; small but influential",
        categories: [relation_extraction],
    },
    // FewRel few-shot relation classification. `url` fetches the
    // `val_wiki.json` file from the thunlp/FewRel repo; `entity_types` holds
    // the head/tail endpoint markers. `hf_id` is presumably the Hugging Face
    // dataset identifier — verify against the loader.
    FewRel {
        name: "FewRel",
        description: "Few-shot relation classification benchmark. 100 relations from Wikidata.",
        url: "https://raw.githubusercontent.com/thunlp/FewRel/master/data/val_wiki.json",
        entity_types: ["head", "tail"],  // Relation endpoint markers
        language: "en",
        domain: "wikipedia",
        license: "MIT",
        citation: "Han et al. (2018)",
        paper_url: "https://aclanthology.org/D18-1514/",
        year: 2018,
        format: "JSONL",
        size_hint: "70k instances, 100 relations",
        notes: "N-way K-shot evaluation; Wikidata relations; FewRel 2.0 adds domain adaptation",
        hf_id: "few_rel",
        categories: [relation_extraction],
    },
    // NYT-10 distantly supervised relation extraction (Freebase-aligned).
    // Only train/test splits are declared; `url` is a plain-HTTP project page.
    NYT10 {
        name: "NYT-10",
        description: "New York Times distant supervision RE. 24 Freebase relations.",
        url: "http://iesl.cs.umass.edu/riedel/ecml/",
        entity_types: ["PER", "ORG", "LOC"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Riedel et al. (2010)",
        paper_url: "https://aclanthology.org/W10-1001/",
        year: 2010,
        format: "Custom",
        size_hint: "~266k sentences",
        notes: "Distant supervision using Freebase alignment; distantly supervised; noisy labels; majority no_relation; standard DS-RE benchmark",
        splits: ["train", "test"],
        tasks: ["re"],
        categories: [relation_extraction],
    },

    // =========================================================================
    // Additional Biomedical
    // =========================================================================
    // JNLPBA 2004 bio-entity recognition. `url` fetches the `test.tsv` file
    // from the MTL-Bioinformatics-2016 repo; tags use the IOB2 scheme.
    // No `splits`/`tasks` fields are declared.
    JNLPBA {
        name: "JNLPBA",
        description: "JNLPBA 2004 shared task. Bio-entity recognition in PubMed abstracts.",
        url: "https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/JNLPBA/test.tsv",
        entity_types: ["PROTEIN", "DNA", "RNA", "CELL_TYPE", "CELL_LINE"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Kim et al. (2004)",
        paper_url: "https://aclanthology.org/W04-1213/",
        year: 2004,
        format: "CoNLL",
        annotation_scheme: "IOB2",
        size_hint: "~2,400 abstracts",
        notes: "Extended GENIA categories; foundational bioNER benchmark",
        categories: [ner, biomedical],
    },
    // Species-800 corpus (single SPECIES label). `url` is a tar.gz archive of
    // XML that, per `notes`, needs manual extraction and conversion to CoNLL;
    // access is marked `DependsOnOther`.
    S800 {
        name: "S800",
        description: "Species-800 corpus. Species name recognition in biomedical text.",
        url: "https://species.jensenlab.org/files/S800-1.0.tar.gz",
        entity_types: ["SPECIES"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-4.0",
        citation: "Pafilis et al. (2013)",
        paper_url: "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0065390",
        year: 2013,
        format: "XML",
        size_hint: "800 abstracts",
        notes: "Species NER; taxonomy normalization; tar.gz archive with XML format; requires manual extraction and conversion to CoNLL",
        access_status: DependsOnOther,
        categories: [ner, biomedical],
    },

    // =========================================================================
    // Temporal NER (Time expressions, Events)
    // =========================================================================
    // TempEval-3 temporal annotation benchmark (TIMEX and EVENT spans) in
    // TimeML format. No `size_hint` is recorded; categorized only as `ner`
    // although `tasks` also lists "temporal".
    TempEval3 {
        name: "TempEval-3",
        description: "Temporal annotation benchmark. TIMEX, EVENT spans, and temporal relations.",
        url: "https://figshare.com/articles/dataset/TempEval-3_data/9586532",
        entity_types: ["TIMEX", "EVENT"],
        language: "en",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "UzZaman et al. (2013)",
        paper_url: "https://aclanthology.org/S13-2001/",
        year: 2013,
        format: "TimeML",
        notes: "Time expression NER + event detection + temporal ordering; TimeBank based; TE3-Platinum gold standard",
        splits: ["train", "test"],
        tasks: ["ner", "temporal"],
        categories: [ner],
    },
    // TimeBank 1.2 TimeML corpus (TIMEX3, EVENT, SIGNAL). `url` points at the
    // LDC catalog entry and `license` is "LDC", but no `access_status` is set.
    TimeBank12 {
        name: "TimeBank 1.2",
        description: "Canonical temporal IE corpus. News articles with TIMEX3, events, and temporal links (TLINKs).",
        url: "https://catalog.ldc.upenn.edu/LDC2006T08",
        entity_types: ["TIMEX3", "EVENT", "SIGNAL"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Pustejovsky et al. (2003)",
        paper_url: "https://aclanthology.org/W03-1808/",
        year: 2003,
        format: "TimeML",
        size_hint: "183 news documents, ~9k events",
        notes: "Original TimeML corpus; basis for TempEval shared tasks; temporal ordering gold standard",
        splits: ["train", "test"],
        tasks: ["ner", "temporal", "events"],
        categories: [ner],
    },
    // MATRES event-event temporal relations (single EVENT type), a
    // re-annotation of a TimeBank/AQUAINT subset per `notes`.
    // NOTE(review): uses category `relations`, whereas the RE entries in this
    // file use `relation_extraction` — confirm both identifiers are valid.
    MATRES {
        name: "MATRES",
        description: "Multi-Axis Temporal Relations. Cleaner, more consistent event-event temporal relation annotations.",
        url: "https://github.com/qiangning/MATRES",
        entity_types: ["EVENT"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Ning et al. (2018)",
        paper_url: "https://aclanthology.org/P18-1212/",
        year: 2018,
        format: "Custom",
        size_hint: "~13.5k temporal relation pairs",
        notes: "Re-annotated TimeBank/AQUAINT subset; higher inter-annotator agreement; verb-centric",
        splits: ["train", "dev", "test"],
        tasks: ["temporal", "events", "re"],
        categories: [ner, relations],
    },
    // THYME clinical temporal IE corpus. `url` is empty and access is marked
    // `Registration`, so the data is not directly downloadable from here.
    THYME {
        name: "THYME",
        description: "Temporal Histories of Your Medical Events. Clinical temporal IE with events and relations.",
        url: "",
        entity_types: ["EVENT", "TIMEX3", "SECTIONTIME", "DOCTIME"],
        language: "en",
        domain: "clinical",
        license: "Research",
        citation: "Styler et al. (2014)",
        paper_url: "https://aclanthology.org/L14-1393/",
        year: 2014,
        format: "Custom",
        size_hint: "~600 clinical notes (colon cancer, brain cancer)",
        notes: "THYME guidelines; clinical events, temporal expressions, narrative containers; Clinical TempEval basis",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "temporal", "events"],
        access_status: Registration,
        categories: [ner, clinical, biomedical],
    },
    // i2b2 2012 clinical temporal relations challenge. `url` is empty and
    // access is `Registration` (a DUA is required per `notes`). `year` is 2012
    // (as in the challenge name) while the cited paper is dated 2013.
    I2B2Temporal {
        name: "i2b2 2012 Temporal",
        description: "Clinical temporal relations challenge. Events, TIMEX3, and TLINKs in discharge summaries.",
        url: "",
        entity_types: ["EVENT", "TIMEX3"],
        language: "en",
        domain: "clinical",
        license: "Research",
        citation: "Sun et al. (2013)",
        paper_url: "https://aclanthology.org/S13-2035/",
        year: 2012,
        format: "Custom",
        size_hint: "~310 clinical notes",
        notes: "i2b2 2012 challenge; requires DUA; clinical temporal relation extraction benchmark",
        splits: ["train", "test"],
        tasks: ["ner", "temporal", "re"],
        access_status: Registration,
        categories: [ner, clinical, biomedical, relations],
    },

    // =========================================================================
    // Multimodal NER
    // =========================================================================
    // Twitter-2015 multimodal NER (text + image). `url` is the UMT GitHub
    // repo; per `notes`, the images live in a separate Google Drive archive.
    Twitter2015MNER {
        name: "Twitter-2015 MNER",
        description: "Multimodal NER on Twitter. Text + image for entity recognition.",
        url: "https://github.com/jefferyYu/UMT",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "social_media",
        license: "Research",
        citation: "Zhang et al. (2018)",
        paper_url: "https://aclanthology.org/N18-1078/",
        year: 2018,
        format: "CoNLL",
        size_hint: "~8,000 tweets with images",
        notes: "Multimodal; images via Google Drive archive; UMT preprocessing; first MNER dataset; visual context aids entity recognition",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "mner"],
        categories: [ner, social_media, multimodal],
    },

    // =========================================================================
    // Music Domain
    // =========================================================================
    // Distant Listening Corpus: harmonic annotations on musical scores,
    // registered as a sequence-labeling dataset (music-theory labels stand in
    // for entity types). Only a "train" split is declared; language is "mul".
    DistantListeningCorpus {
        name: "Distant Listening Corpus",
        description: "1,283 musical scores with harmonic annotations. String quartet + piano music with Roman numeral analysis.",
        url: "https://zenodo.org/records/15150283",
        entity_types: ["CHORD", "KEY", "MODULATION", "CADENCE", "PHRASE"],
        language: "mul",
        domain: "music",
        license: "CC-BY-4.0",
        citation: "Devaney et al. (2024)",
        paper_url: "https://doi.org/10.5281/zenodo.15150283",
        year: 2024,
        format: "TSV",
        size_hint: "1,283 scores, 190k+ annotations",
        notes: "Music theory annotation corpus; Roman numeral analysis; supports harmonic sequence extraction; Zenodo archive",
        splits: ["train"],
        tasks: ["sequence_labeling", "harmonic_analysis"],
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Privacy/PII Detection
    // =========================================================================
    // Synthetic PII detection/masking dataset. `entity_types` lists 10
    // representative labels although `notes` mentions 50+ types. `hf_id`
    // matches the dataset path in the Hugging Face `url`. No `paper_url`.
    PIIMasking200k {
        name: "PII Masking 200k",
        description: "200k synthetic examples for PII detection and masking. Covers 50+ PII types.",
        url: "https://huggingface.co/datasets/ai4privacy/pii-masking-200k",
        entity_types: ["EMAIL", "PHONE", "SSN", "ADDRESS", "NAME", "DOB", "CREDIT_CARD", "PASSPORT", "IP_ADDRESS", "LICENSE"],
        language: "mul",
        domain: "privacy",
        license: "Apache-2.0",
        citation: "AI4Privacy (2024)",
        year: 2024,
        format: "JSONL",
        size_hint: "~200k examples",
        notes: "Synthetic PII dataset; multi-language; 50+ entity types; useful for privacy compliance testing",
        splits: ["train"],
        tasks: ["ner", "pii_detection"],
        hf_id: "ai4privacy/pii-masking-200k",
        categories: [ner],
    },

    // =========================================================================
    // Legal/SEC Domain
    // =========================================================================
    // E-NER: NER over SEC EDGAR filings (10-K / 10-Q per `notes`), distributed
    // as CSV. Filed under `arcane_domain`; no `paper_url` is recorded.
    ENERSec {
        name: "E-NER SEC",
        description: "Legal NER from SEC EDGAR filings. 52 documents with financial entity annotations.",
        url: "https://github.com/jnishii/E-NER",
        entity_types: ["ORG", "LOC", "DATE", "MONEY", "PERCENT", "PERSON", "PRODUCT", "CARDINAL"],
        language: "en",
        domain: "legal",
        license: "MIT",
        citation: "Nishii et al. (2023)",
        year: 2023,
        format: "CSV",
        size_hint: "52 documents, ~400k tokens",
        notes: "SEC 10-K and 10-Q filings; financial regulatory domain; legal entity extraction",
        splits: ["train", "test"],
        tasks: ["ner"],
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Entity Linking / Named Entity Disambiguation Datasets
    // =========================================================================
    // MSNBC entity-linking test set (test split only). `url` is empty and
    // access is marked `ContactAuthors`.
    MSNBCEL {
        name: "MSNBC",
        description: "Small news article entity linking dataset. Commonly used for out-of-domain EL evaluation.",
        url: "",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Cucerzan (2007)",
        paper_url: "https://aclanthology.org/D07-1074/",
        year: 2007,
        format: "Custom",
        size_hint: "~20 docs, ~700 mentions",
        notes: "Early EL benchmark; often used as OOD test set alongside AIDA",
        splits: ["test"],
        tasks: ["el", "entity_linking", "ned"],
        access_status: ContactAuthors,
        categories: [entity_linking],
    },
    // AQUAINT newswire entity-linking test set (test split only). LDC-licensed:
    // `url` is empty and access is marked `Registration`. No `paper_url`.
    AQUAINT {
        name: "AQUAINT",
        description: "Newswire entity linking dataset from AQUAINT corpus. Wikipedia-linked mentions.",
        url: "",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Milne & Witten (2008)",
        year: 2008,
        format: "Custom",
        size_hint: "~50 docs, ~700 mentions",
        notes: "Commonly paired with AIDA for comprehensive EL evaluation",
        splits: ["test"],
        tasks: ["el", "entity_linking", "ned"],
        access_status: Registration,
        categories: [entity_linking],
    },
    // KORE50: 50 short, highly ambiguous sentences for stress-testing entity
    // disambiguation; additionally tagged `adversarial`. Test split only.
    KORE50 {
        name: "KORE50",
        description: "Short, highly ambiguous entity linking snippets. Tests disambiguation difficulty.",
        url: "https://github.com/KORE50/KORE50-NIF-NER",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "evaluation",
        license: "CC-BY-4.0",
        citation: "Hoffart et al. (2012)",
        paper_url: "https://aclanthology.org/P12-1084/",
        year: 2012,
        format: "Custom",
        size_hint: "50 sentences, 144 mentions",
        notes: "Highly ambiguous mentions; stress-tests disambiguation ability; includes YAGO types",
        splits: ["test"],
        tasks: ["el", "entity_linking", "ned"],
        categories: [entity_linking, adversarial],
    },
    // WNED-WIKI: entity linking built from Wikipedia hyperlinks (silver
    // annotations per `notes`). Uses a single generic "ENTITY" type.
    // `url` points at the wikipedia2vec repo; no `paper_url` is recorded.
    WNEDWiki {
        name: "WNED-WIKI",
        description: "Large-scale Wikipedia entity linking dataset extracted from Wikipedia hyperlinks.",
        url: "https://github.com/wikipedia2vec/wikipedia2vec",
        entity_types: ["ENTITY"],
        language: "en",
        domain: "wikipedia",
        license: "Research",
        citation: "Guo & Barbosa (2018)",
        year: 2018,
        format: "Custom",
        size_hint: "~6M mentions",
        notes: "Large-scale silver annotations from Wikipedia hyperlinks",
        splits: ["test"],
        tasks: ["el", "entity_linking"],
        categories: [entity_linking],
    },
    // WNED-ClueWeb: entity linking on noisy web text, with a single generic
    // "ENTITY" type. `url` is empty and access is marked `Registration`.
    WNEDClueweb {
        name: "WNED-ClueWeb",
        description: "Web-scale entity linking from ClueWeb corpus. Tests EL on noisy web text.",
        url: "",
        entity_types: ["ENTITY"],
        language: "en",
        domain: "web",
        license: "Research",
        citation: "Guo & Barbosa (2018)",
        year: 2018,
        format: "Custom",
        size_hint: "~10k docs",
        notes: "Web-scale EL benchmark; tests robustness on noisy web text",
        splits: ["test"],
        tasks: ["el", "entity_linking"],
        access_status: Registration,
        categories: [entity_linking],
    },
    // BELB: biomedical entity-linking benchmark that unifies 11 corpora over
    // 7 knowledge bases. `citation` names the first author of the
    // Bioinformatics paper linked in `paper_url` (Garda et al., 2023), which
    // also matches the `sg-wbi` repository owner in `url`.
    BELB {
        name: "BELB",
        description: "Biomedical Entity Linking Benchmark unifying 11 corpora across 7 knowledge bases. Standardized biomedical EL evaluation.",
        url: "https://github.com/sg-wbi/belb",
        entity_types: ["Disease", "Chemical", "Gene", "Species", "CellLine", "Variant"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Garda et al. (2023)",
        paper_url: "https://academic.oup.com/bioinformatics/article/39/11/btad698/7425450",
        year: 2023,
        format: "JSONL",
        size_hint: "11 corpora, 7 KBs",
        notes: "Unifies BC5CDR-Chemical, BC5CDR-Disease, NCBI-Disease, BC2GN, NLM-Gene, Linnaeus, S800, GNORMPLUS, MedMentions, and more",
        splits: ["train", "dev", "test"],
        tasks: ["el", "entity_linking", "ned"],
        categories: [entity_linking, biomedical],
    },
    // MELO: multilingual occupation entity linking (single OCCUPATION type),
    // 48 datasets across 21 languages. Test split only; language is "mul".
    MELO {
        name: "MELO",
        description: "Multilingual Entity Linking of Occupations. 48 datasets across 21 languages for occupation EL.",
        url: "https://github.com/avature/melo-benchmark",
        entity_types: ["OCCUPATION"],
        language: "mul",
        domain: "general",
        license: "Apache-2.0",
        citation: "Retyk et al. (2024)",
        paper_url: "https://aclanthology.org/2024.lrec-main.889/",
        year: 2024,
        format: "JSONL",
        size_hint: "48 datasets, 21 languages",
        notes: "Zero-shot multilingual EL; includes sentence encoders and lexical baselines",
        splits: ["test"],
        tasks: ["el", "entity_linking"],
        categories: [entity_linking, multilingual],
    },

    // Mewsli-X: multilingual, zero-shot cross-lingual entity linking from
    // WikiNews, with a single generic "ENTITY" type. Test split only.
    // `paper_url` links the cited Botha et al. (2020) paper,
    // "Entity Linking in 100 Languages" (arXiv 2011.02690).
    MewsliX {
        name: "Mewsli-X",
        description: "Multilingual entity linking across 50 languages. Wikipedia-linked mentions for zero-shot cross-lingual EL.",
        url: "https://huggingface.co/datasets/izhx/mewsli-x",
        entity_types: ["ENTITY"],  // Wikipedia entities
        language: "mul",
        domain: "news",
        license: "Apache-2.0",
        citation: "Botha et al. (2020)",
        paper_url: "https://arxiv.org/abs/2011.02690",
        year: 2020,
        format: "TSV",
        size_hint: "~300k mentions across 50 languages",
        notes: "Zero-shot cross-lingual EL benchmark; from WikiNews; Wikipedia KB",
        splits: ["test"],
        tasks: ["el", "entity_linking", "ned"],
        access_status: Public,
        categories: [entity_linking, multilingual],
    },

    // =========================================================================
    // Long-Document Datasets (Coreference, NER, QA)
    // =========================================================================
    // NOTE: BookCoref (sapienzanlp) is defined earlier (~line 2035). This is a different dataset.
    // Full-novel coreference benchmark (silver + gold annotations). Although
    // an `hf_id` is provided, access is marked `ContactAuthors` because, per
    // `notes`, the Hugging Face dataset appears gated in practice.
    BookCorefBamman {
        name: "BookCoref (Bamman)",
        description: "Full-novel coreference with automatic silver and manual gold annotations. Includes Animal Farm, Siddhartha, Pride and Prejudice.",
        url: "https://huggingface.co/datasets/spacemanidol/BookCoref",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "literature",
        license: "Research",
        citation: "Bamman et al. (2025)",
        paper_url: "https://arxiv.org/abs/2507.12075",
        year: 2025,
        format: "JSONL",
        size_hint: "~200k tokens per document",
        notes: "Long-document coref benchmark; tests models on full novels; silver + gold annotations. HF-hosted but appears gated in practice; treat as manual unless you have explicit access.",
        splits: ["test"],
        tasks: ["coref"],
        hf_id: "spacemanidol/BookCoref",
        access_status: ContactAuthors,
        categories: [coref, literary, long_document],
    },
    // NovelCR: bilingual (EN/ZH) novel coreference, hence language "mul" and
    // the `multilingual` category alongside `coref`/`literary`/`long_document`.
    NovelCR {
        name: "NovelCR",
        description: "Large-scale bilingual (EN/ZH) novel coreference. 148k EN mentions, 311k ZH mentions with 74-83% spanning 3+ sentences.",
        url: "https://github.com/NovelCR/NovelCR",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mul",
        domain: "literature",
        license: "Research",
        citation: "Chen et al. (2024)",
        paper_url: "https://openreview.net/forum?id=zuZXwj9aSE",
        year: 2024,
        format: "JSONL",
        size_hint: "EN: 148k mentions, ZH: 311k mentions",
        notes: "Long-span coreference; bilingual EN/ZH; most coreferences span multiple sentences",
        splits: ["train", "dev", "test"],
        tasks: ["coref"],
        categories: [coref, literary, long_document, multilingual],
    },
    // AgCNER: Chinese agricultural disease/pest NER. `entity_types` lists 7
    // labels while `notes` mentions 13 entity types, so the list here is a
    // subset. `url` is a figshare collection page.
    AgCNER {
        name: "AgCNER",
        description: "Large-scale Chinese agricultural NER. 66k samples, ~207k entities, 3.9M characters.",
        url: "https://springernature.figshare.com/collections/AgCNER_the_First_Large-Scale_Chinese_Named_Entity_Recognition_Dataset_for_Agricultural_Diseases_and_Pests/6807873",
        entity_types: ["CROP", "DISEASE", "PEST", "CHEMICAL", "VARIETY", "LOCATION", "TIME"],
        language: "zh",
        domain: "scientific",
        license: "CC-BY-4.0",
        citation: "AgCNER Team (2024)",
        paper_url: "https://www.nature.com/articles/s41597-024-03578-5",
        year: 2024,
        format: "JSONL",
        size_hint: "66k samples, ~207k entities, 3.9M characters",
        notes: "Nature Scientific Data 2024; 13 entity types; long agricultural case reports; domain NER",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, long_document, multilingual, arcane_domain],
    },
    // SCROLLS QMSum: long-document QA over meeting transcripts. This is not an
    // entity dataset — `entity_types` is empty and the only task is "qa".
    ScrollsQMSum {
        name: "SCROLLS QMSum",
        description: "Long-document QA from SCROLLS benchmark. Query-focused meeting summarization.",
        url: "https://github.com/tau-nlp/scrolls",
        entity_types: [],
        language: "en",
        domain: "dialogue",
        license: "MIT",
        citation: "Shaham et al. (2022)",
        paper_url: "https://aclanthology.org/2022.emnlp-main.823/",
        year: 2022,
        format: "JSONL",
        size_hint: "~1.5k meeting transcripts, avg 10k tokens",
        notes: "EMNLP 2022; SCROLLS benchmark subset; long meeting transcripts; tests long-context understanding",
        splits: ["train", "dev", "test"],
        tasks: ["qa"],
        categories: [long_document, dialogue],
    },
    // Long Document NER: English long-context NER with the four CoNLL-style
    // labels, registered for the "ner" task. No `paper_url` is set for this
    // entry.
    LongDocNER {
        name: "Long Document NER",
        description: "Long-document NER benchmark. Tests entity recognition across extended contexts.",
        url: "https://github.com/xhuang28/LongDocNER",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "mixed",
        license: "MIT",
        citation: "Huang et al. (2024)",
        year: 2024,
        format: "JSONL",
        size_hint: "~500 documents, avg 8k tokens",
        notes: "Tests long-context NER models; entity consistency across document boundaries",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, long_document],
    },
    // BookSum Coref: coreference chains over BookSum book chapters, registered
    // for the "coref" task. Only train/test splits are declared (no dev).
    BookSumCoref {
        name: "BookSum Coref",
        description: "Coreference annotations on book chapters from BookSum. Long literary texts.",
        url: "https://github.com/salesforce/booksum",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "literature",
        license: "Research",
        citation: "Kryscinski et al. (2022)",
        paper_url: "https://aclanthology.org/2022.findings-emnlp.438/",
        year: 2022,
        format: "JSONL",
        size_hint: "~400 chapters, avg 5k tokens",
        notes: "Book chapters with coref chains; tests long-span coreference resolution",
        splits: ["train", "test"],
        tasks: ["coref"],
        categories: [coref, long_document, literary],
    },
    // Multi-Bio Long NER: biomedical NER over full-text articles, registered
    // for the "ner" task. No `paper_url` is set for this entry.
    MultiBioNERLong {
        name: "Multi-Bio Long NER",
        description: "Long biomedical document NER. Full-text articles vs abstracts.",
        url: "https://github.com/dmis-lab/multi-bio-ner",
        entity_types: ["GENE", "CHEMICAL", "DISEASE", "SPECIES"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Lee et al. (2023)",
        year: 2023,
        format: "JSONL",
        size_hint: "~1k full-text articles",
        notes: "Full-text vs abstract NER comparison; tests biomedical long-context models",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, long_document, biomedical],
    },
    // RadCoref: coreference over MIMIC-CXR radiology reports in BRAT format,
    // registered for the "coref" task. `url` and `paper_url` point to the same
    // PhysioNet page.
    // NOTE(review): `notes` says PhysioNet credentialing is required, but no
    // `access_status` is set here (other gated entries in this file use
    // `access_status: Registration`) — confirm.
    RadCoref {
        name: "RadCoref",
        description: "Radiology report coreference from MIMIC-CXR. Clinical domain long-document coref.",
        url: "https://physionet.org/content/rad-coreference-resolution/",
        entity_types: ["ANATOMY", "OBSERVATION", "FINDING"],
        language: "en",
        domain: "clinical",
        license: "PhysioNet",
        citation: "Zhu et al. (2024)",
        paper_url: "https://physionet.org/content/rad-coreference-resolution/",
        year: 2024,
        format: "BRAT",
        size_hint: "~500 radiology reports",
        notes: "Clinical coref on MIMIC-CXR; requires PhysioNet credentialing; radiology-specific entities",
        splits: ["train", "test"],
        tasks: ["coref"],
        categories: [coref, clinical, biomedical],
    },

    // =========================================================================
    // Cross-Document Event Coreference
    // =========================================================================
    // MEANTIME: multilingual (EN, ES, IT, NL) news corpus with within- and
    // cross-document event coreference. Declares a single "all" split rather
    // than train/dev/test.
    MEANTIME {
        name: "MEANTIME",
        description: "Multilingual news corpus with within- and cross-document event coreference. 4 languages.",
        url: "https://github.com/newsreader/meantime",
        entity_types: ["EVENT", "TIMEX", "PARTICIPANT", "LOCATION"],
        language: "mul",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Minard et al. (2016)",
        paper_url: "https://aclanthology.org/L16-1699/",
        year: 2016,
        format: "Custom",
        size_hint: "120 documents, 4 languages (EN, ES, IT, NL)",
        notes: "Multilingual CDEC; parallel annotations across languages; NewsReader project",
        splits: ["all"],
        tasks: ["coref", "event_coref", "cdcr"],
        categories: [coref, event_coref, multilingual],
    },
    // FCC-T: Football Coreference Corpus with token-level annotations for
    // cross-document event coreference in sports news. Registered under the
    // same three task ids as the other entries in this section.
    FCCT {
        name: "FCC-T",
        description: "Football Coreference Corpus with token-level annotations. Cross-document event coref in sports news.",
        url: "https://github.com/cltl/FCC",
        entity_types: ["EVENT", "PARTICIPANT", "TIME", "LOCATION"],
        language: "en",
        domain: "sports",
        license: "CC-BY-4.0",
        citation: "Bugert et al. (2021)",
        paper_url: "https://direct.mit.edu/coli/article/47/3/575/102774",
        year: 2021,
        format: "CoNLL",
        size_hint: "~300 docs",
        notes: "Token-level CDEC; compatible with ECB+ and GVC; sports domain temporal reasoning",
        splits: ["train", "dev", "test"],
        tasks: ["coref", "event_coref", "cdcr"],
        categories: [coref, event_coref],
    },
    // LEMONADE: multilingual conflict-event corpus for cross-document event
    // coreference search. Test-only (`splits: ["test"]`); no `paper_url` is
    // set for this entry.
    LEMONADE {
        name: "LEMONADE",
        description: "Large-scale multilingual conflict event corpus. 39k events across 20 languages for CDEC search.",
        url: "https://github.com/lemonade-coref/lemonade",
        entity_types: ["EVENT", "PARTICIPANT", "LOCATION", "TIME"],
        language: "mul",
        domain: "news",
        license: "Research",
        citation: "Eirew et al. (2025)",
        year: 2025,
        format: "JSONL",
        size_hint: "~39k events, 20 languages, 171 countries",
        notes: "Conflict event CDEC; cross-document event coreference search task; multilingual",
        splits: ["test"],
        tasks: ["coref", "event_coref", "cdcr"],
        categories: [coref, event_coref, multilingual],
    },

    // =========================================================================
    // Newer Biomedical/Clinical NER and RE
    // =========================================================================
    // BioRED: document-level biomedical relation extraction over PubMed
    // abstracts, with novelty labels. `tasks` carries both the short ("re")
    // and long ("relation_extraction") task id alongside "ner".
    BioRED {
        name: "BioRED",
        description: "Document-level biomedical RE with novelty labels. BioCreative VIII shared task benchmark.",
        url: "https://ftp.ncbi.nlm.nih.gov/pub/lu/BioRED/",
        entity_types: ["Gene", "Disease", "Chemical", "Species", "Variant", "CellLine"],
        language: "en",
        domain: "biomedical",
        license: "Public",
        citation: "Luo et al. (2022)",
        paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/baae069/7729400",
        year: 2022,
        format: "Custom",
        size_hint: "600 PubMed abstracts, 8 relation types",
        notes: "Document-level RE with novelty detection; distinguishes novel vs known relations",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "re", "relation_extraction"],
        categories: [ner, relation_extraction, biomedical],
    },
    // MedMentions: PubMed abstracts with mentions linked to UMLS concepts.
    // A single placeholder label ("UMLS_CONCEPT") is declared; `tasks` carries
    // both the short ("el") and long ("entity_linking") task id.
    MedMentions {
        name: "MedMentions",
        description: "Large-scale biomedical concept mentions mapped to UMLS. PubMed abstracts with fine-grained semantic types.",
        url: "https://github.com/chanzuckerberg/MedMentions",
        entity_types: ["UMLS_CONCEPT"],
        language: "en",
        domain: "biomedical",
        license: "CC0-1.0",
        citation: "Mohan & Li (2019)",
        paper_url: "https://arxiv.org/abs/1902.09476",
        year: 2019,
        format: "Custom",
        size_hint: "4,392 abstracts, 352k mentions, 35k concepts",
        notes: "UMLS concept linking; 127 semantic types; large-scale biomedical concept NER/EL",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "el", "entity_linking"],
        categories: [ner, entity_linking, biomedical],
    },
    // EnzChemRED: enzyme-chemistry relation extraction from biochemical
    // literature. Only train/test splits are declared (no dev).
    EnzChemRED {
        name: "EnzChemRED",
        description: "Enzyme chemistry relation extraction. Links enzymes, substrates, products, cofactors from biochemical literature.",
        url: "https://github.com/ncbi-nlp/EnzChemRED",
        entity_types: ["Enzyme", "Substrate", "Product", "Cofactor", "Reaction"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-4.0",
        citation: "Schröder et al. (2024)",
        paper_url: "https://www.nature.com/articles/s41597-024-03835-7",
        year: 2024,
        format: "JSONL",
        size_hint: "~5k relation triplets",
        notes: "Specialized enzyme chemistry RE; biochemical reaction extraction",
        splits: ["train", "test"],
        tasks: ["ner", "re", "relation_extraction"],
        categories: [ner, relation_extraction, biomedical],
    },
    // NCERB: clinical NER benchmark suite that aggregates several clinical
    // corpora. Test-only entry registered for the "ner" task.
    NCERB {
        name: "NCERB",
        description: "Named Clinical Entity Recognition Benchmark. Multi-dataset clinical NER evaluation suite.",
        url: "https://github.com/NCERB/NCERB",
        entity_types: ["Problem", "Treatment", "Test", "Medication", "Anatomy"],
        language: "en",
        domain: "clinical",
        license: "Research",
        citation: "Zhou et al. (2024)",
        paper_url: "https://arxiv.org/abs/2410.05046",
        year: 2024,
        format: "Custom",
        size_hint: "Multiple clinical corpora aggregated",
        notes: "Benchmark suite for clinical NER; evaluates LMs on healthcare entities; aggregates i2b2, n2c2, etc.",
        splits: ["test"],
        tasks: ["ner"],
        categories: [ner, clinical, biomedical],
    },
    // MACCROBAT: biomedical NER corpus hosted on figshare (MACCROBAT2018),
    // registered for the "ner" task with train/test splits. No `paper_url` is
    // set for this entry.
    // NOTE(review): MACCROBAT is usually described as annotated clinical case
    // reports attributed to Caufield et al.; the citation, entity_types and
    // size_hint below look like they may describe a different corpus — verify
    // against the figshare record before relying on them.
    MACCROBAT {
        name: "MACCROBAT",
        description: "Biomedical NER corpus with extensive coverage. Used with RoBERTa-WWM and deep models.",
        url: "https://figshare.com/articles/dataset/MACCROBAT2018/9764942",
        entity_types: ["Disease", "Chemical", "Gene", "Species"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-4.0",
        citation: "Islamaj et al. (2019)",
        year: 2019,
        format: "Custom",
        size_hint: "~400 abstracts",
        notes: "Multi-type biomedical NER; chemical and disease mentions",
        splits: ["train", "test"],
        tasks: ["ner"],
        categories: [ner, biomedical],
    },

    // =========================================================================
    // Additional Relation Extraction Datasets
    // =========================================================================
    // ACE 2005 RE: relation-extraction component of ACE 2005 (7 entity types,
    // 6 relation types). The corpus is LDC-licensed, so `url` is intentionally
    // empty and `access_status` is Registration. No `paper_url` is set.
    ACE05RE {
        name: "ACE 2005 RE",
        description: "ACE 2005 relation extraction component. 7 entity types, 6 relation types with subtypes.",
        url: "",
        entity_types: ["PER", "ORG", "GPE", "LOC", "FAC", "VEH", "WEA"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Walker et al. (2006)",
        year: 2005,
        format: "XML",
        // Relation-type count kept in sync with `description` (6 types).
        size_hint: "~600 docs, 6 relation types",
        notes: "Classic RE benchmark; requires LDC license; often used with ACE NER",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "re", "relation_extraction"],
        access_status: Registration,
        categories: [ner, relation_extraction],
    },
    // CoNLL04 RE: sentence-level joint NER + relation extraction benchmark.
    // Only train/test splits are declared (no dev).
    CoNLL04RE {
        name: "CoNLL04 RE",
        description: "Sentence-level relation extraction from CoNLL-2004. Clean, small RE benchmark.",
        url: "https://github.com/bekou/multihead_joint_entity_relation_extraction",
        entity_types: ["PER", "ORG", "LOC", "Other"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Roth & Yih (2004)",
        paper_url: "https://aclanthology.org/W04-2401/",
        year: 2004,
        format: "CoNLL",
        size_hint: "~1.4k sentences, 5 relation types",
        notes: "Clean sentence-level RE; joint NER+RE evaluation",
        splits: ["train", "test"],
        tasks: ["ner", "re", "relation_extraction"],
        categories: [ner, relation_extraction],
    },
    // CrossRE: relation extraction across six domains (listed in
    // `size_hint`). Registered for RE only — "ner" is not among its tasks or
    // categories.
    CrossRE {
        name: "CrossRE",
        description: "Cross-domain relation extraction across 6 domains. Tests RE generalization.",
        url: "https://github.com/mainlp/CrossRE",
        entity_types: ["PER", "ORG", "LOC", "MISC"],
        language: "en",
        domain: "cross_domain",
        license: "CC-BY-4.0",
        citation: "Bassignana & Plank (2022)",
        paper_url: "https://aclanthology.org/2022.emnlp-main.452/",
        year: 2022,
        format: "JSON",
        size_hint: "6 domains: AI, Literature, Music, News, Politics, Science",
        notes: "Cross-domain RE evaluation; tests transfer across domains",
        splits: ["train", "dev", "test"],
        tasks: ["re", "relation_extraction"],
        categories: [relation_extraction],
    },

    // =========================================================================
    // Unified Multilingual NER
    // =========================================================================
    // UNER: Universal NER layered on Universal Dependencies treebanks, with a
    // unified PER/LOC/ORG schema across 13 languages. Stored as CoNLL-U.
    UNER {
        name: "UNER",
        description: "Universal NER on Universal Dependencies. Gold NER with unified schema across 13 languages.",
        url: "https://github.com/UniversalNER/UNER",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mul",
        domain: "general",
        license: "CC-BY-SA-4.0",
        citation: "Mayhew et al. (2024)",
        paper_url: "https://aclanthology.org/2024.naacl-long.243/",
        year: 2024,
        format: "CoNLLU",
        size_hint: "13 languages including Cebuano, Tagalog, Narabizi",
        notes: "Unified NER on UD treebanks; includes low-resource languages; community-driven expansion",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, multilingual, low_resource],
    },
    // IndicNER: NER covering 11 Indian languages (AI4Bharat), registered for
    // the "ner" task with train/dev/test splits.
    IndicNER {
        name: "IndicNER",
        description: "Indian languages NER covering 11 Indian languages. Low-resource multilingual NER.",
        url: "https://github.com/AI4Bharat/IndicNER",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mul",
        domain: "general",
        license: "CC-BY-4.0",
        citation: "Mhaske et al. (2022)",
        paper_url: "https://aclanthology.org/2022.findings-acl.269/",
        year: 2022,
        format: "CoNLL",
        size_hint: "11 languages: Hindi, Bengali, Telugu, Tamil, Marathi, etc.",
        notes: "Indian language NER; part of AI4Bharat initiative; low-resource focus",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, multilingual, low_resource],
    },
    // NorNE: Norwegian NER covering both written standards (Bokmål and
    // Nynorsk), registered for the "ner" task with train/dev/test splits.
    // NOTE(review): `notes` states 8 entity types while `entity_types` lists
    // 7 (a single "GPE" label) — confirm whether upstream GPE sub-labels were
    // collapsed on purpose.
    NorNE {
        name: "NorNE",
        description: "Norwegian NER covering Bokmål and Nynorsk. Morphologically rich language from news and parliament text.",
        url: "https://github.com/ltgoslo/norne",
        entity_types: ["PER", "LOC", "ORG", "GPE", "PROD", "EVT", "DRV"],
        language: "no",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Jørgensen et al. (2020)",
        paper_url: "https://aclanthology.org/2020.lrec-1.559/",
        year: 2020,
        format: "CoNLL",
        size_hint: "~600k tokens, both Bokmål and Nynorsk",
        notes: "Both Norwegian written forms; morphologically rich; 8 entity types",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner],
    },
    // GermEval 2014: German NER shared-task corpus with nested/embedded
    // entities. Unlike most entries in this section it also declares an
    // `annotation_scheme` ("BIO").
    GermEval2014 {
        name: "GermEval 2014",
        description: "German NER shared task. Standard German NER benchmark with nested entities.",
        // Data page of the GermEval 2014 NER shared-task site.
        url: "https://sites.google.com/site/germeval2014ner/data",
        entity_types: ["PER", "LOC", "ORG", "OTH"],
        language: "de",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Benikova et al. (2014)",
        // NOTE(review): verify that this Anthology id resolves to the
        // GermEval 2014 / NoSta-D paper.
        paper_url: "https://aclanthology.org/W14-1707/",
        year: 2014,
        format: "CoNLL",
        annotation_scheme: "BIO",
        size_hint: "~31k sentences",
        notes: "Standard German NER; includes nested/embedded entities; derived from Wikipedia and news",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, nested_ner],
    },

    // =========================================================================
    // LLM-Era Evaluation Datasets
    // =========================================================================
    // ReasoningNER: zero-shot NER evaluation suite spanning 20 datasets, aimed
    // at LLMs. Test-only entry registered for the "ner" task.
    ReasoningNER {
        name: "ReasoningNER",
        description: "Zero-shot NER evaluation suite across 20 diverse datasets. Tests LLM NER capabilities.",
        url: "https://github.com/reasoning-ner/reasoning-ner",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "evaluation",
        license: "Research",
        citation: "Xia et al. (2025)",
        paper_url: "https://arxiv.org/abs/2511.11978",
        year: 2025,
        format: "JSONL",
        size_hint: "20 datasets across news, social, biomedical, etc.",
        notes: "Zero-shot NER evaluation; tests instruction-following and entity reasoning in LLMs",
        splits: ["test"],
        tasks: ["ner"],
        categories: [ner, adversarial],
    },
    // BioNER-LLaMA: instruction-formatted biomedical NER benchmark for
    // generative models (disease/chemical/gene). Test-only entry.
    BioNERLLaMA {
        name: "BioNER-LLaMA",
        description: "Instruction-tuned biomedical NER benchmark. Evaluates generative models on disease/chemical/gene NER.",
        url: "https://github.com/BIDS-Xu-Lab/BioNER-LLaMA",
        entity_types: ["Disease", "Chemical", "Gene"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Keloth et al. (2024)",
        paper_url: "https://academic.oup.com/bioinformatics/article/40/4/btae163/7633405",
        year: 2024,
        format: "JSONL",
        size_hint: "Instruction-formatted from BC5CDR, NCBI, etc.",
        notes: "LLM instruction-tuning for BioNER; evaluates ChatGPT, LLaMA, etc. on biomedical entities",
        splits: ["test"],
        tasks: ["ner"],
        categories: [ner, biomedical],
    },
    // Mention Resolution LLM: multiple-choice coreference questions built from
    // LitBank and FantasyCoref, for LLM evaluation. Test-only entry registered
    // for the "coref" task.
    MentionResolutionLLM {
        name: "Mention Resolution LLM",
        description: "MCQ-format coreference for LLMs from LitBank and FantasyCoref. Tests referential understanding on narratives.",
        url: "https://github.com/mention-resolution/mention-resolution-llm",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "literature",
        license: "Research",
        citation: "Adams et al. (2024)",
        paper_url: "https://arxiv.org/abs/2411.07466",
        year: 2024,
        format: "JSONL",
        size_hint: "MCQ from LitBank + FantasyCoref",
        notes: "Multiple-choice coref for LLM evaluation; tests ambiguous, long-distance, nested mentions",
        splits: ["test"],
        tasks: ["coref"],
        categories: [coref, literary],
    },

    // =========================================================================
    // Additional Discontinuous/Clinical NER
    // =========================================================================
    // ShARe 2013: clinical disorder mentions with discontinuous spans from
    // ShARe/CLEF eHealth 2013. `url` is empty and `access_status` is
    // Registration, i.e. no direct download location is recorded.
    // NOTE(review): the task id "discontinuous-ner" is hyphenated, whereas
    // the category (`discontinuous_ner`) and other multi-word task ids in
    // this file use underscores — confirm which spelling consumers expect.
    ShARe2013 {
        name: "ShARe 2013",
        description: "Clinical disorder mentions from ShARe/CLEF eHealth 2013. Discontinuous entity annotations.",
        url: "",
        entity_types: ["DISORDER"],
        language: "en",
        domain: "clinical",
        license: "Research",
        citation: "Pradhan et al. (2013)",
        paper_url: "https://aclanthology.org/S13-2056/",
        year: 2013,
        format: "Custom",
        size_hint: "~300 clinical notes",
        notes: "Clinical NER with discontinuous spans; shared task at CLEF eHealth",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "discontinuous-ner"],
        access_status: Registration,
        categories: [ner, discontinuous_ner, clinical, biomedical],
    },
    // ShARe 2014: clinical disorder mentions (plus anatomy/modifier labels)
    // from ShARe/CLEF eHealth 2014. `url` is empty and `access_status` is
    // Registration; only train/test splits are declared.
    // NOTE(review): the task id "discontinuous-ner" is hyphenated, whereas
    // the category (`discontinuous_ner`) and other multi-word task ids in
    // this file use underscores — confirm which spelling consumers expect.
    ShARe2014 {
        name: "ShARe 2014",
        description: "Clinical disorder mentions from ShARe/CLEF eHealth 2014. Improved discontinuous NER annotations.",
        url: "",
        entity_types: ["DISORDER", "ANATOMY", "MODIFIER"],
        language: "en",
        domain: "clinical",
        license: "Research",
        citation: "Mowery et al. (2014)",
        paper_url: "https://aclanthology.org/S14-2007/",
        year: 2014,
        format: "Custom",
        size_hint: "~400 clinical notes",
        notes: "Improved clinical discontinuous NER; attribute normalization",
        splits: ["train", "test"],
        tasks: ["ner", "discontinuous-ner"],
        access_status: Registration,
        categories: [ner, discontinuous_ner, clinical, biomedical],
    },
    // i2b2 2010: clinical concept extraction (problem/treatment/test) over
    // discharge summaries. `url` is empty and `access_status` is Registration;
    // `notes` records that a data use agreement is required.
    I2B2_2010 {
        name: "i2b2 2010",
        description: "Clinical concept extraction and assertion classification. Foundational clinical NER benchmark.",
        url: "",
        entity_types: ["PROBLEM", "TREATMENT", "TEST"],
        language: "en",
        domain: "clinical",
        license: "Research",
        citation: "Uzuner et al. (2011)",
        paper_url: "https://academic.oup.com/jamia/article/18/5/552/833880",
        year: 2010,
        format: "Custom",
        size_hint: "~871 discharge summaries",
        notes: "Foundational clinical NER; requires i2b2/n2c2 data use agreement",
        splits: ["train", "test"],
        tasks: ["ner"],
        access_status: Registration,
        categories: [ner, clinical, biomedical],
    },

    // =========================================================================
    // Legal Domain
    // =========================================================================
    // LexGLUE NER: legal-domain entry tied to the LexGLUE benchmark,
    // registered for both "ner" and "classification".
    // NOTE(review): confirm that the linked LexGLUE repository actually ships
    // token-level NER annotations with these labels; the `notes` field only
    // describes the benchmark's document types.
    LexGLUENER {
        name: "LexGLUE NER",
        description: "Legal NER from LexGLUE benchmark. Legal entity extraction from case law and contracts.",
        url: "https://github.com/coastalcph/lex-glue",
        entity_types: ["PERSON", "ORGANIZATION", "LOCATION", "DATE", "LEGAL_REF", "COURT"],
        language: "en",
        domain: "legal",
        license: "Research",
        citation: "Chalkidis et al. (2022)",
        paper_url: "https://aclanthology.org/2022.acl-long.297/",
        year: 2022,
        format: "JSONL",
        size_hint: "Part of LexGLUE benchmark suite",
        notes: "Legal domain benchmark; includes contracts, case law, legislation",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "classification"],
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Financial Domain
    // =========================================================================
    // FinBen NER: NER portion of the FinBen financial benchmark. Test-only
    // entry; `notes` records that the parent benchmark also covers
    // classification and QA, but only "ner" is registered here.
    FinBenNER {
        name: "FinBen NER",
        description: "Financial NER from FinBen benchmark. Entity extraction from financial documents and filings.",
        url: "https://github.com/TheFinAI/FinBen",
        entity_types: ["COMPANY", "PERSON", "MONEY", "PERCENT", "DATE", "PRODUCT"],
        language: "en",
        domain: "financial",
        license: "Research",
        citation: "Xie et al. (2024)",
        paper_url: "https://arxiv.org/abs/2402.12659",
        year: 2024,
        format: "JSONL",
        size_hint: "Multi-task financial benchmark",
        notes: "Financial IE benchmark; includes NER, classification, QA; 2024 NeurIPS",
        splits: ["test"],
        tasks: ["ner"],
        categories: [ner, arcane_domain],
    },
    // FiNER-139: financial NER over SEC 10-K/10-Q filings with 139 entity
    // types, hosted on the Hugging Face Hub (`hf_id` mirrors `url`).
    // Citation, year and size follow the dataset's paper: Loukas et al.,
    // ACL 2022, ~1.1M sentences.
    FiNER139 {
        name: "FiNER-139",
        description: "Financial NER with 139 fine-grained entity types. SEC 10-K/10-Q filings.",
        url: "https://huggingface.co/datasets/nlpaueb/finer-139",
        // NOTE(review): upstream labels are XBRL tags; these six names look
        // like coarse placeholders — confirm they match what the loader emits.
        entity_types: ["COMPANY", "EXECUTIVE", "SUBSIDIARY", "PRODUCT", "REGULATION", "FINANCIAL_METRIC"],
        language: "en",
        domain: "financial",
        // NOTE(review): verify the license against the Hub dataset card.
        license: "MIT",
        citation: "Loukas et al. (2022)",
        year: 2022,
        format: "JSONL",
        size_hint: "~1.1M sentences, 139 entity types",
        notes: "Fine-grained financial NER; hierarchical entity types; SEC filings",
        splits: ["train", "test"],
        tasks: ["ner"],
        hf_id: "nlpaueb/finer-139",
        categories: [ner, nested_ner, arcane_domain],
    },

    // =========================================================================
    // Constructed Languages
    // Constructed language NLP is valuable for:
    // - Testing language universals vs learned biases
    // - Evaluating cross-lingual transfer to truly novel languages
    // - Edge case testing (highly regular morphology, OVS word order, etc.)
    // =========================================================================
    // taggedPBC Esperanto: POS-tagged Esperanto text from the Parallel Bible
    // Corpus, stored as CoNLL-U with a single "train" split. `notes` records
    // that there is no dedicated NER layer yet, although "ner" is among the
    // registered tasks.
    TaggedPBCEsperanto {
        name: "taggedPBC Esperanto",
        description: "POS-tagged Esperanto from Parallel Bible Corpus. ~1800 sentences with word-level alignment.",
        url: "https://github.com/clab/taggedPBC",
        entity_types: ["PER", "LOC", "ORG"],
        language: "eo",
        domain: "religious",
        license: "CC-BY-4.0",
        // NOTE(review): verify the author attribution against the arXiv
        // record linked below.
        citation: "Zeman et al. (2025)",
        paper_url: "https://arxiv.org/abs/2505.12560",
        // Matches the 2025 citation and the arXiv 2505.* identifier.
        year: 2025,
        format: "CoNLLU",
        size_hint: "~1800 sentences, New Testament",
        notes: "First large-scale annotated Esperanto corpus; cross-linguistic POS; no dedicated NER layer yet",
        splits: ["train"],
        tasks: ["ner", "sequence_labeling"],
        categories: [ner, constructed, low_resource],
    },
    // taggedPBC Klingon: POS-tagged Klingon text from the Parallel Bible
    // Corpus, stored as CoNLL-U with a single "train" split. Shares its
    // repository URL and paper with the Esperanto taggedPBC entry.
    TaggedPBCKlingon {
        name: "taggedPBC Klingon",
        description: "POS-tagged Klingon from Parallel Bible Corpus. OVS word order with complex verbal morphology.",
        url: "https://github.com/clab/taggedPBC",
        entity_types: ["PER", "LOC", "ORG"],
        language: "tlh",
        domain: "religious",
        license: "CC-BY-4.0",
        // NOTE(review): verify the author attribution against the arXiv
        // record linked below.
        citation: "Zeman et al. (2025)",
        paper_url: "https://arxiv.org/abs/2505.12560",
        // Matches the 2025 citation and the arXiv 2505.* identifier.
        year: 2025,
        format: "CoNLLU",
        size_hint: "~1800 sentences, New Testament",
        notes: "Klingon has OVS word order, agglutinative verbs with suffix slots; tests non-SVO processing",
        splits: ["train"],
        tasks: ["ner", "sequence_labeling"],
        categories: [ner, constructed, low_resource],
    },
    // UD Esperanto Cairo: small Universal Dependencies treebank for Esperanto.
    // `url` points directly at the raw test .conllu file, and only a "test"
    // split is declared.
    // NOTE(review): `description` and `notes` both say the treebank has no NER
    // layer, yet the entry is registered for the "ner" task with PER/LOC/ORG
    // labels — confirm this is intentional.
    UDEsperantoCairo {
        name: "UD Esperanto Cairo",
        description: "Universal Dependencies treebank for Esperanto. Syntax annotation without NER layer.",
        url: "https://raw.githubusercontent.com/UniversalDependencies/UD_Esperanto-Cairo/master/eo_cairo-ud-test.conllu",
        entity_types: ["PER", "LOC", "ORG"],
        language: "eo",
        domain: "constructed_language",
        license: "CC-BY-SA-4.0",
        citation: "Wennerberg (2020)",
        paper_url: "https://universaldependencies.org/eo/index.html",
        year: 2020,
        format: "CoNLLU",
        size_hint: "2 documents (Manifesto, Cairo sample)",
        notes: "Small treebank illustrating UD annotation for Esperanto; no NER layer but suitable base for annotation",
        splits: ["test"],
        tasks: ["ner"],
        categories: [ner, constructed, low_resource],
    },
    // Klingon Effect LID: language-identification dataset that includes 11
    // constructed languages. Registered for "classification" only;
    // `entity_types` is empty.
    // NOTE(review): `url` is identical to `paper_url` and points at a PDF, not
    // at downloadable data — confirm whether a data location exists.
    KlingonEffectLID {
        name: "Klingon Effect LID",
        description: "Language ID dataset with 11 constructed languages. 14.2M sentences across 101 languages.",
        url: "https://wmdqs.org/submissions-2025/19.pdf",
        entity_types: [],
        language: "mul",
        domain: "general",
        license: "Research",
        citation: "Moura et al. (2025)",
        paper_url: "https://wmdqs.org/submissions-2025/19.pdf",
        year: 2025,
        format: "Custom",
        size_hint: "14.2M sentences, 101 languages (11 constructed)",
        notes: "Shows constructed languages (Esperanto, Klingon, Ido, Interlingua) outperform natural languages in LID",
        splits: ["test"],
        tasks: ["classification"],
        categories: [constructed, multilingual, adversarial],
    },
    // Lojban Tatoeba: Lojban-English sentence pairs from Tatoeba, registered
    // for the "mt" task with a single "all" split. `entity_types` is empty and
    // no `paper_url` is set.
    LojbanTatoeba {
        name: "Lojban Tatoeba",
        description: "Lojban-English sentence pairs from Tatoeba. Logical language translation corpus.",
        url: "https://tatoeba.org/en/downloads",
        entity_types: [],
        language: "jbo",
        domain: "constructed_language",
        license: "CC-BY-2.0",
        citation: "Tatoeba Project (2024)",
        year: 2024,
        format: "TSV",
        size_hint: "~3k sentence pairs",
        notes: "Logical constructed language; predicate logic syntax; useful for semantic parsing studies",
        splits: ["all"],
        tasks: ["mt"],
        categories: [constructed, low_resource],
    },
    // Interlingue Wikipedia: raw text corpus from the Interlingue (Occidental)
    // Wikipedia dump, registered for the "lm" task with a single "all" split.
    // `entity_types` is empty and no `paper_url` is set.
    InterlingueWikipedia {
        name: "Interlingue Wikipedia",
        description: "Interlingue (Occidental) Wikipedia text corpus. International auxiliary language.",
        url: "https://dumps.wikimedia.org/iewiki/",
        entity_types: [],
        language: "ie",
        domain: "encyclopedia",
        license: "CC-BY-SA-4.0",
        citation: "Wikimedia (2024)",
        year: 2024,
        format: "XML",
        size_hint: "~4k articles",
        notes: "Western European vocabulary roots; naturalistic IAL; smaller than Esperanto Wikipedia",
        splits: ["all"],
        tasks: ["lm"],
        categories: [constructed, low_resource],
    },
    // Toki Pona Corpus: plain-text corpus of the minimalist constructed
    // language Toki Pona, registered for the "lm" task with a single "all"
    // split. `entity_types` is empty and no `paper_url` is set.
    TokiPonaCorpus {
        name: "Toki Pona Corpus",
        description: "Toki Pona minimalist language corpus. 120-word language for semantic simplification.",
        url: "https://github.com/kilipan/toki-pona-corpus",
        entity_types: [],
        language: "tok",
        domain: "constructed_language",
        license: "CC0-1.0",
        citation: "Lang (2021)",
        year: 2021,
        format: "TXT",
        size_hint: "~50k tokens",
        notes: "Philosophical constructed language; only 120 words; tests compositional semantics",
        splits: ["all"],
        tasks: ["lm"],
        categories: [constructed, low_resource],
    },

    // =========================================================================
    // Recent 2024-2025 Benchmarks
    // =========================================================================
    // OmniNER2025: fine-grained Chinese NER over informal text. `url` is empty
    // and `access_status` is NotYetReleased, so no download location is
    // recorded for this entry.
    OmniNER2025 {
        name: "OmniNER2025",
        description: "Diverse fine-grained Chinese NER covering informal text (social media, forums). Large-scale benchmark for modern NER models.",
        url: "",
        entity_types: ["PER", "LOC", "ORG", "GPE", "FAC", "PRODUCT", "EVENT"],
        language: "zh",
        domain: "social_media",
        license: "Research",
        citation: "OmniNER Team (2025)",
        paper_url: "https://dl.acm.org/doi/10.1145/3726302.3730048",
        year: 2025,
        format: "JSONL",
        size_hint: "Large-scale Chinese informal text",
        notes: "2025 benchmark for fine-grained Chinese NER; expands beyond formal text; tests LLM capabilities",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        access_status: NotYetReleased,
        categories: [ner, social_media, multilingual],
    },
    // LegalCore: event coreference in long legal documents. `url` is empty and
    // `access_status` is NotYetReleased. The `citation` field names the venue
    // rather than the authors.
    LegalCore {
        name: "LegalCore",
        description: "Event coreference in long legal documents. Long-distance cross-section event links.",
        url: "",
        entity_types: ["EVENT", "PARTICIPANT", "TIME"],
        language: "en",
        domain: "legal",
        license: "Research",
        citation: "ACL Findings (2025)",
        paper_url: "https://aclanthology.org/2025.findings-acl.1284.pdf",
        year: 2025,
        format: "JSONL",
        size_hint: "Long legal documents, largest tokens per document",
        notes: "ACL 2025; benchmarks Llama-3.1, Mistral, Qwen, GPT-4; LLMs underperform supervised baselines",
        splits: ["train", "dev", "test"],
        tasks: ["event_coref", "coref"],
        access_status: NotYetReleased,
        categories: [coref, event_coref, long_document, arcane_domain],
    },
    // Z-coref: joint coreference and zero-pronoun resolution for pro-drop
    // languages. `url` is empty and `access_status` is NotYetReleased.
    // NOTE(review): `paper_url` carries an arXiv 2504.* identifier (a 2025
    // submission), while `citation` and `year` both say 2024 — confirm which
    // side is correct.
    Zcoref {
        name: "Z-coref",
        description: "Joint coreference and zero-pronoun resolution. For languages with pro-drop (Chinese, Japanese, Korean).",
        url: "",
        entity_types: ["ZERO_PRONOUN", "ENTITY"],
        language: "mul",
        domain: "general",
        license: "Research",
        citation: "Z-coref Authors (2024)",
        paper_url: "https://arxiv.org/pdf/2504.05824",
        year: 2024,
        format: "CoNLL",
        size_hint: "Multi-language pro-drop coreference",
        notes: "Tests handling of dropped arguments; critical for CJK languages; zero anaphora resolution",
        splits: ["train", "dev", "test"],
        tasks: ["coref"],
        access_status: NotYetReleased,
        categories: [coref, multilingual, abstract_anaphora],
    },
    // TikTalkCoref: person-mention coreference in Chinese Douyin comment dialogues.
    // Only the PER type is listed; per `notes`, only the text portion is in scope
    // (the multimodal aspect is excluded). No download URL is recorded yet.
    TikTalkCoref {
        name: "TikTalkCoref",
        description: "Chinese social media dialogue coreference. Person mentions in Douyin video comments with singleton handling.",
        url: "",
        entity_types: ["PER"],
        language: "zh",
        domain: "social_media",
        license: "Research",
        citation: "Li, Gong & Fu (2025)",
        paper_url: "https://arxiv.org/abs/2504.14321",
        year: 2025,
        format: "Custom",
        size_hint: "1,012 dialogues, 2,179 mentions, 1,435 clusters",
        notes: "First Chinese MCR dataset for social media. Maverick outperforms e2e-coref (65.5 vs 39.1 Avg.F1). High singleton rate: 44% pronouns, 34% proper names, 22% common nouns. Text-only portion; multimodal aspect out of scope.",
        splits: ["train", "dev", "test"],
        tasks: ["coref"],
        access_status: NotYetReleased,
        categories: [coref, dialogue, social_media],
    },
    // MHERCL: test-only entity-linking benchmark for long-tail historical entities.
    // NOTE(review): `url` is identical to `paper_url` (an arXiv HTML page), so no
    // separate data location is recorded, and no `access_status` is given.
    // NOTE(review): `tasks` lists both "el" and "entity_linking"; other EL entries
    // in this file use "el" alone — presumably aliases, confirm both are needed.
    MHERCL {
        name: "MHERCL",
        description: "Historical long-tail entity linking benchmark. Tests LLM behavior on rare/historical Wikidata entities.",
        url: "https://arxiv.org/html/2505.03473v1",
        entity_types: ["HISTORICAL_ENTITY"],
        language: "en",
        domain: "historical",
        license: "Research",
        citation: "MHERCL Authors (2025)",
        paper_url: "https://arxiv.org/html/2505.03473v1",
        year: 2025,
        format: "JSONL",
        size_hint: "Long-tail historical entities",
        notes: "v0.1; tests EL on niche historical entities; analyzes LLM behavior on rare entities",
        splits: ["test"],
        tasks: ["el", "entity_linking"],
        categories: [entity_linking, historical, adversarial],
    },
    // SNOMED CT EL Challenge: clinical entity linking to SNOMED CT concepts.
    // `url` is empty and `access_status` is Registration; `paper_url` points at a
    // snomed.org news announcement rather than a publication.
    // NOTE(review): `tasks` lists both "el" and "entity_linking" — presumably
    // aliases; confirm both are needed.
    SNOMEDChallenge {
        name: "SNOMED CT EL Challenge",
        description: "Clinical entity linking to SNOMED CT. From SNOMED International 2024 challenge.",
        url: "",
        entity_types: ["CLINICAL_CONCEPT"],
        language: "en",
        domain: "clinical",
        license: "Research",
        citation: "SNOMED International (2024)",
        paper_url: "https://www.snomed.org/news/snomed-international-announces-entity-linking-challenge-winners",
        year: 2024,
        format: "Custom",
        size_hint: "Clinical notes, SNOMED CT linked",
        notes: "2024 challenge dataset; SNOMED CT coded clinical text; benchmarks clinical EL systems",
        splits: ["train", "test"],
        tasks: ["el", "entity_linking"],
        access_status: Registration,
        categories: [entity_linking, clinical, biomedical],
    },
    // ESCO Skills EL: links skill mentions in job-posting text to the ESCO taxonomy.
    // `url` is empty and `access_status` is ContactAuthors. `notes` refers to a
    // MELO dataset that is defined outside this section of the file.
    // NOTE(review): `citation` names the venue rather than the authors; `tasks`
    // lists both "el" and "entity_linking" — presumably aliases, confirm.
    ESCOSkillsEL {
        name: "ESCO Skills EL",
        description: "Entity linking for occupational skills to ESCO taxonomy. Job market domain, multilingual.",
        url: "",
        entity_types: ["SKILL"],
        language: "mul",
        domain: "general",
        license: "Research",
        citation: "EACL Findings (2024)",
        paper_url: "https://aclanthology.org/2024.findings-eacl.28/",
        year: 2024,
        format: "Custom",
        size_hint: "Skill mentions across multiple languages",
        notes: "Complements MELO; links skills (not occupations) to ESCO taxonomy; job posting text",
        splits: ["train", "test"],
        tasks: ["el", "entity_linking"],
        access_status: ContactAuthors,
        categories: [entity_linking, multilingual],
    },

    // =========================================================================
    // Bioacoustics / Non-Human Communication
    // =========================================================================
    // NatureLM-audio: bioacoustic audio-text pairs. `tasks` are classification
    // and captioning rather than NER/EL/coref.
    // NOTE(review): `categories` includes `multilingual` while `language` is
    // "en" — confirm the category is intended (multi-species vs. multi-language).
    NatureLMAudio {
        name: "NatureLM-audio",
        description: "Foundation model training collection for bioacoustics. Multi-species audio-text pairs.",
        url: "https://github.com/earthspecies/naturelm-audio",
        entity_types: ["SPECIES", "CALL_TYPE", "BEHAVIOR"],
        language: "en",
        domain: "bioacoustics",
        license: "Research",
        citation: "NatureLM Team (2024)",
        paper_url: "https://arxiv.org/abs/2411.07186",
        year: 2024,
        format: "Custom",
        size_hint: "Multi-taxon audio-text pairs (birds, marine mammals, primates)",
        notes: "Bioacoustic foundation model data; paired audio-text descriptions; cross-taxa experiments",
        splits: ["train", "test"],
        tasks: ["classification", "captioning"],
        categories: [arcane_domain, multilingual],
    },
    // BEANS-Zero: test-only bioacoustics benchmark driven by natural-language
    // prompts. It carries the same `citation` and `paper_url` as the
    // NatureLM-audio entry and has no `size_hint`.
    // NOTE(review): `notes` mentions captioning, but `tasks` lists only
    // classification and retrieval — confirm the task list is complete.
    BEANSZero {
        name: "BEANS-Zero",
        description: "Bioacoustics benchmark beyond species classification. Natural-language prompts for animal sounds.",
        url: "https://github.com/earthspecies/beans-zero",
        entity_types: ["SPECIES", "CALL_TYPE", "INDIVIDUAL"],
        language: "en",
        domain: "bioacoustics",
        license: "Research",
        citation: "NatureLM Team (2024)",
        paper_url: "https://arxiv.org/abs/2411.07186",
        year: 2024,
        format: "Custom",
        notes: "Zero-shot transfer to unseen taxa; captioning, retrieval, instruction-following on animal vocalizations",
        splits: ["test"],
        tasks: ["classification", "retrieval"],
        categories: [arcane_domain, adversarial],
    },

    // =========================================================================
    // Chemical / Materials Science NER
    // =========================================================================
    // NLM-Chem: chemical NER plus normalization (entity linking) to MeSH over
    // full-text PMC articles, distributed in BRAT format from the NCBI FTP site.
    // NOTE(review): `license: "Public"` is a free-form label, unlike the
    // SPDX-style identifiers used elsewhere (e.g. "CC-BY-4.0") — confirm wording.
    NLMChem {
        name: "NLM-Chem",
        description: "Chemical entity recognition and normalization. Full-text PMC articles with MeSH identifiers.",
        url: "https://ftp.ncbi.nlm.nih.gov/pub/lu/NLM-Chem/",
        entity_types: ["CHEMICAL", "DRUG"],
        language: "en",
        domain: "biomedical",
        license: "Public",
        citation: "Islamaj et al. (2021)",
        paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/baac102/6858529",
        year: 2021,
        format: "BRAT",
        size_hint: "~150 full-text articles, ~38k annotations",
        notes: "Gold-standard chemical NER; normalized to MeSH; used for BioCreative VII",
        splits: ["train", "test"],
        tasks: ["ner", "el"],
        categories: [ner, biomedical, entity_linking],
    },
    // CHEMDNER: abstract-level chemical and drug name recognition from the
    // BioCreative IV shared task; NER only, BIO-tagged, with train/dev/test splits.
    CHEMDNER {
        name: "CHEMDNER",
        description: "Chemical compound and drug name recognition in scientific text.",
        url: "https://biocreative.bioinformatics.udel.edu/tasks/biocreative-iv/chemdner/",
        entity_types: ["CHEMICAL", "DRUG", "ABBREVIATION"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Krallinger et al. (2015)",
        paper_url: "https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2",
        year: 2015,
        format: "BIO",
        size_hint: "~10k abstracts",
        notes: "BioCreative IV shared task; abstract-level chemical NER; foundational chemistry benchmark",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, biomedical],
    },

    // =========================================================================
    // Temporal / Event NER
    // =========================================================================
    // TimeBank-Dense: dense temporal-relation annotation over EVENT and TIMEX3
    // mentions in news text, stored as TimeML.
    // NOTE(review): `categories` includes `ner` although `tasks` lists only
    // "temporal" and "event_coref" — confirm the two are meant to differ.
    TimeBankDense {
        name: "TimeBank-Dense",
        description: "Dense temporal relation annotation. Re-annotation of TimeBank with more consistent TLINK labels.",
        url: "https://github.com/bethard/timebank-dense",
        entity_types: ["EVENT", "TIMEX3"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Chambers et al. (2014)",
        paper_url: "https://aclanthology.org/Q14-1002/",
        year: 2014,
        format: "TimeML",
        size_hint: "~36 documents, dense annotation",
        notes: "Event-event temporal relations; BEFORE/AFTER/INCLUDES/VAGUE; timeline construction benchmark",
        splits: ["train", "dev", "test"],
        tasks: ["temporal", "event_coref"],
        categories: [ner, event_coref],
    },

    // =========================================================================
    // Multimodal NER
    // =========================================================================
    // Twitter-GMNER: grounded multimodal NER — tweet entity mentions tied to
    // image bounding boxes. `tasks` adds "grounding" alongside "ner".
    TwitterGMNER {
        name: "Twitter-GMNER",
        description: "Grounded Multimodal NER. Entities linked to bounding boxes in social media images.",
        url: "https://github.com/JinYuanLi0012/RiVEG",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "social_media",
        license: "CC-BY-4.0",
        citation: "Li et al. (2024)",
        paper_url: "https://aclanthology.org/2024.findings-acl.58/",
        year: 2024,
        format: "JSONL",
        size_hint: "~8k tweets with images",
        notes: "Entity mentions grounded to image regions; visual-textual entity alignment",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "grounding"],
        categories: [ner, social_media, arcane_domain],
    },
    // MNER-MI: multimodal NER where each social media post comes with several
    // images; registered here for the "ner" task only.
    MNERMI {
        name: "MNER-MI",
        description: "Multimodal NER with Multiple Images. Social media posts with multiple image context.",
        url: "https://github.com/NUSTM/MNER-MI",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "social_media",
        license: "CC-BY-4.0",
        citation: "Wang et al. (2024)",
        paper_url: "https://aclanthology.org/2024.lrec-main.1001/",
        year: 2024,
        format: "JSONL",
        size_hint: "~5k tweets with multiple images",
        notes: "Multi-image context improves NER; temporal-prompt model baseline; LREC-COLING 2024",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, social_media],
    },
    // 2M-NER: multilingual multimodal NER over text-image pairs in four
    // languages (EN, FR, DE, ES per `size_hint`). The key is spelled `TwoMNER`,
    // presumably because an identifier cannot begin with a digit.
    TwoMNER {
        name: "2M-NER",
        description: "Multilingual Multimodal NER. Four languages with text-image pairs.",
        url: "https://github.com/Alibaba-NLP/2M-NER",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "mul",
        domain: "social_media",
        license: "Apache-2.0",
        citation: "Liu et al. (2024)",
        paper_url: "https://arxiv.org/abs/2404.17122",
        year: 2024,
        format: "JSONL",
        size_hint: "~20k examples, 4 languages (EN, FR, DE, ES)",
        notes: "Contrastive text-image alignment; multilingual multimodal NER benchmark",
        splits: ["train", "dev", "test"],
        tasks: ["ner"],
        categories: [ner, multilingual, social_media],
    },

    // =========================================================================
    // Mathematical / Scientific NER
    // =========================================================================
    // Mathematical Entities: term/definition/theorem extraction from category
    // theory corpora. `format` is "LaTeX"; `notes` says the LaTeX source is
    // preserved and entities link to Wikidata/nLab (hence the "el" task).
    MathEntities {
        name: "Mathematical Entities",
        description: "Terminology and definition extraction from mathematical text. Category theory corpora.",
        url: "https://github.com/dmazzei/mathematical-entities",
        entity_types: ["TERM", "DEFINITION", "THEOREM"],
        language: "en",
        domain: "scientific",
        license: "CC-BY-4.0",
        citation: "Mazzei et al. (2024)",
        paper_url: "https://aclanthology.org/2024.lrec-main.966/",
        year: 2024,
        format: "LaTeX",
        size_hint: "~3 corpora in category theory",
        notes: "LaTeX source preservation; math-aware NER; entity linking to Wikidata/nLab",
        splits: ["train", "test"],
        tasks: ["ner", "el"],
        categories: [ner, arcane_domain, entity_linking],
    },
    // SciERC: entities and relations from AI/ML paper abstracts. Registered for
    // both "ner" and "re"; `notes` flags that nested entities are common, which
    // matches the `nested_ner` category.
    SciERC {
        name: "SciERC",
        description: "Scientific information extraction from AI/ML papers. Nested entities and relations.",
        url: "https://nlp.cs.washington.edu/sciIE/",
        entity_types: ["TASK", "METHOD", "METRIC", "MATERIAL", "GENERIC", "OTHER"],
        language: "en",
        domain: "scientific",
        license: "CC-BY-4.0",
        citation: "Luan et al. (2018)",
        paper_url: "https://aclanthology.org/D18-1360/",
        year: 2018,
        format: "JSONL",
        size_hint: "~500 abstracts",
        notes: "Canonical scientific NER + relation extraction; nested entities common",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "re"],
        categories: [ner, nested_ner, relation_extraction, arcane_domain],
    },

    // =========================================================================
    // Geospatial / Toponym NER
    // =========================================================================
    // GeoWebNews: toponym recognition and resolution on web news, with GeoNames
    // linking per `notes` (hence both "ner" and "el").
    // NOTE(review): uses the label "FACILITY" where other entries in this file
    // use "FAC" — confirm the label matches the dataset's own tag set.
    GeoWebNews {
        name: "GeoWebNews",
        description: "Geoparsing benchmark from web news. Toponyms with geocoding coordinates.",
        url: "https://github.com/milangritta/GeoWebNews",
        entity_types: ["LOC", "GPE", "FACILITY"],
        language: "en",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Gritta et al. (2020)",
        paper_url: "https://aclanthology.org/2020.lrec-1.381/",
        year: 2020,
        format: "CoNLL",
        size_hint: "~4k documents",
        notes: "Toponym recognition + resolution; GeoNames linking; web news geoparsing",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "el"],
        categories: [ner, entity_linking],
    },
    // LGL (Local-Global Lexicon): toponym disambiguation on news articles,
    // exposed as a single "all" split. No `paper_url` is recorded.
    // NOTE(review): `url` points at the wikipedia2vec repository, whose name is
    // unrelated to this dataset, and `license` is "MIT" — verify both against
    // the actual LGL distribution.
    LGL {
        name: "LGL",
        description: "Local-Global Lexicon for toponym disambiguation. News articles with geolocation.",
        url: "https://github.com/wikipedia2vec/wikipedia2vec",
        entity_types: ["LOC"],
        language: "en",
        domain: "news",
        license: "MIT",
        citation: "Lieberman et al. (2010)",
        year: 2010,
        format: "Custom",
        size_hint: "~5.8k place references",
        notes: "Toponym disambiguation benchmark; local vs global context for geolocation",
        splits: ["all"],
        tasks: ["ner", "el"],
        categories: [ner, entity_linking],
    },

    // =========================================================================
    // Procedural / Recipe NER
    // =========================================================================
    // TASTEset: NER over recipe ingredient lists, BIO-tagged. No `paper_url`.
    // NOTE(review): `description` says "9 entity classes" but `entity_types`
    // lists only 6 labels — confirm the label set against the dataset.
    TASTEset {
        name: "TASTEset",
        description: "Recipe ingredient NER. 700 annotated recipe ingredient lists with 9 entity classes.",
        url: "https://github.com/taisti/TASTEset",
        entity_types: ["INGREDIENT", "QUANTITY", "UNIT", "STATE", "SIZE", "TEMP"],
        language: "en",
        domain: "food",
        license: "CC-BY-4.0",
        citation: "TASTEset Team (2023)",
        year: 2023,
        format: "BIO",
        size_hint: "~700 ingredient lists",
        notes: "Recipe NER benchmark; BIO/BILOU conversion utilities; BERT model pipeline",
        splits: ["train", "test"],
        tasks: ["ner"],
        categories: [ner, arcane_domain],
    },
    // Recipe NER: ingredient and instruction entities in recipes; `size_hint`
    // describes three tiers (manual, augmented, machine-annotated).
    // NOTE(review): `citation` credits "Deepgram" while `url` is a repository
    // under the cosylabiiit GitHub org — confirm the attribution.
    RecipeNER {
        name: "Recipe NER",
        description: "Deep learning recipe NER. Multi-scale datasets with ingredient and instruction entities.",
        url: "https://github.com/cosylabiiit/recipe-ner",
        entity_types: ["INGREDIENT", "QUANTITY", "UNIT", "PROCESS", "UTENSIL", "TEMP"],
        language: "en",
        domain: "food",
        license: "MIT",
        citation: "Deepgram (2024)",
        paper_url: "https://aclanthology.org/2024.lrec-main.406/",
        year: 2024,
        format: "BIO",
        size_hint: "~88k phrases (6.6k manual, 26k augmented, 88k machine)",
        notes: "Three-tier dataset; baseline pipelines exist (e.g., spaCy transformer); recipe IE pipeline",
        splits: ["train", "test"],
        tasks: ["ner"],
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Code / Software Entity Recognition
    // =========================================================================
    // CodeSearchNet: code-docstring pairs registered for "retrieval" only;
    // `notes` describes it as a foundation for code NER, not an NER dataset.
    // NOTE(review): `language: "mul"` and the `multilingual` category appear to
    // refer to the six programming languages named in `notes` — confirm.
    CodeSearchNet {
        name: "CodeSearchNet",
        description: "Code understanding benchmark. Function documentation and code search across 6 languages.",
        url: "https://github.com/github/CodeSearchNet",
        entity_types: ["FUNCTION", "CLASS", "VARIABLE", "MODULE"],
        language: "mul",
        domain: "code",
        license: "MIT",
        citation: "Husain et al. (2019)",
        paper_url: "https://arxiv.org/abs/1909.09436",
        year: 2019,
        format: "JSONL",
        size_hint: "~2M functions across 6 programming languages",
        notes: "Code-docstring pairs; Python, Java, Go, PHP, JavaScript, Ruby; foundation for code NER",
        splits: ["train", "dev", "test"],
        tasks: ["retrieval"],
        categories: [arcane_domain, multilingual],
    },

    // =========================================================================
    // Fictional / Literary Entity Recognition
    // =========================================================================
    // FABLE: literary NER resource for narrative fiction. No `paper_url`,
    // `size_hint`, `splits` or `tasks` are recorded.
    // NOTE(review): `description` and `notes` describe a model rather than a
    // dataset, and `url` has a single path segment where Hugging Face repos are
    // normally "owner/name" — verify the link and whether this belongs here.
    FABLE {
        name: "FABLE",
        description: "Fiction Adapted BERT for Literary Entities. DeBERTa-based NER for narrative fiction.",
        url: "https://huggingface.co/DeBERTa-literary-entities",
        entity_types: ["CHARACTER", "LOCATION", "ORGANIZATION", "ARTIFACT"],
        language: "en",
        domain: "fiction",
        license: "MIT",
        citation: "FABLE Team (2024)",
        year: 2024,
        format: "Custom",
        notes: "Literary NER model; targets invented names in fantasy/SF; trained on narrative fiction",
        categories: [ner, literary],
    },
    // ELGold: multi-genre Polish NER + entity linking (fiction, press, blogs).
    // No `size_hint` is recorded.
    // NOTE(review): `categories` includes `multilingual` although `language` is
    // the single code "pl" — presumably meaning "non-English"; confirm.
    ELGold {
        name: "ELGold",
        description: "Gold-standard multi-genre Polish NER+EL. Includes fiction, press, blogs.",
        url: "https://mostwiedzy.pl/en/open-research-data/elgold-gold-standard-multi-genre-dataset",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "pl",
        domain: "general",
        license: "CC-BY-4.0",
        citation: "Pokrywka et al. (2025)",
        paper_url: "https://www.nature.com/articles/s41597-025-05274-4",
        year: 2025,
        format: "JSONL",
        notes: "Multi-genre including fiction; Wikipedia-linked; Polish language",
        splits: ["train", "test"],
        tasks: ["ner", "el"],
        categories: [ner, entity_linking, literary, multilingual],
    },

    // =========================================================================
    // Streaming / Temporal Entity Evolution
    // =========================================================================
    // Streaming CD-Coref: described as an evaluation protocol for streaming
    // cross-document entity coreference. `url` is a direct PDF link; no
    // `size_hint`, `splits` or `tasks` are recorded.
    StreamingCDCoref {
        name: "Streaming CD-Coref",
        description: "Streaming cross-document entity coreference protocol. News domain streaming evaluation.",
        url: "https://www.cs.jhu.edu/~mdredze/publications/streaming_coref_coling.pdf",
        entity_types: ["PER", "ORG", "LOC"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Dredze et al. (2010)",
        paper_url: "https://aclanthology.org/C10-1032/",
        year: 2010,
        format: "Custom",
        notes: "Canonical streaming entity clustering; O(n) single-pass; evolving cluster representations",
        categories: [coref, long_document],
    },
    // Tem-DocRED: temporal document-level relation extraction; per `size_hint`
    // it is Re-DocRED extended with timestamps. Registered for "re" and
    // "temporal" rather than NER.
    TemDocRED {
        name: "Tem-DocRED",
        description: "Temporal document-level relation extraction. Converts static triples to temporal quadruples.",
        url: "https://github.com/THUDM/Tem-DocRED",
        entity_types: ["PER", "ORG", "LOC", "TIME"],
        language: "en",
        domain: "wikipedia",
        license: "MIT",
        citation: "Zhang et al. (2024)",
        paper_url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC12048500/",
        year: 2024,
        format: "JSONL",
        size_hint: "Re-DocRED + temporal timestamps",
        notes: "Temporal KG construction from documents; LLM + pattern mining for timestamp inference",
        splits: ["train", "dev", "test"],
        tasks: ["re", "temporal"],
        categories: [relation_extraction, long_document],
    },

    // =========================================================================
    // Scientific / Concept Coreference
    // =========================================================================
    // SciCo-Radar: cross-document coreference over scientific concepts (not
    // named entities), with hierarchy per `notes`. No `size_hint` is recorded.
    SciCoRadar {
        name: "SciCo-Radar",
        description: "Scientific cross-document concept coreference. Dynamic definitions via LLM retrieval.",
        url: "https://github.com/allenai/scico-radar",
        entity_types: ["CONCEPT", "METHOD", "TASK", "MATERIAL"],
        language: "en",
        domain: "scientific",
        license: "Apache-2.0",
        citation: "Wadden et al. (2024)",
        paper_url: "https://arxiv.org/abs/2409.15113",
        year: 2024,
        format: "JSONL",
        notes: "Cross-doc concept coref with hierarchy; LLM-generated relational definitions improve F1",
        splits: ["train", "dev", "test"],
        tasks: ["coref"],
        categories: [coref, arcane_domain],
    },

    // =========================================================================
    // Process Mining / Event KGs
    // =========================================================================
    // Event KG Drift: concept-drift detection on event knowledge graphs.
    // `url` is a direct PDF link; no `paper_url`, `size_hint`, `splits` or
    // `tasks` are recorded.
    // NOTE(review): `citation` names an institution rather than authors, and
    // `domain` is "evaluation" — confirm both values.
    EventKGDrift {
        name: "Event KG Drift",
        description: "Multi-perspective concept drift detection on event knowledge graphs.",
        url: "https://research.tue.nl/files/349781334/978-3-031-61057-8_9.pdf",
        entity_types: ["EVENT", "CASE", "ACTOR", "TIME"],
        language: "en",
        domain: "evaluation",
        license: "Research",
        citation: "TU Eindhoven (2024)",
        year: 2024,
        format: "Custom",
        notes: "Actor-centric features give 2.6x stronger drift signals; temporal graph drift on EKGs",
        categories: [event_coref, long_document, arcane_domain],
    },

    // =========================================================================
    // Semantic Drift / KG Curation
    // =========================================================================
    // Wikidata Semantic Drift: drift/inconsistency detection in Wikidata.
    // `entity_types` is intentionally empty here, and `url` is the same arXiv
    // page as `paper_url`, so no separate data location is recorded.
    // NOTE(review): `year`/`citation` say 2024, but the arXiv identifier starts
    // with 2511 (arXiv IDs are YYMM-prefixed, i.e. 2025-11). Confirm whether
    // the year or the link is wrong.
    WikidataDrift {
        name: "Wikidata Semantic Drift",
        description: "Semantic drift detection in Wikidata. LLM-based classification inconsistency detection.",
        url: "https://arxiv.org/abs/2511.04926",
        entity_types: [],
        language: "mul",
        domain: "encyclopedia",
        license: "CC0-1.0",
        citation: "Wikidata Drift Team (2024)",
        paper_url: "https://arxiv.org/abs/2511.04926",
        year: 2024,
        format: "Custom",
        notes: "Multi-dimensional semantic risk model; drift threshold ~0.6; continuous KG curation",
        categories: [entity_linking, adversarial],
    },

    // =========================================================================
    // Additional Legacy Datasets (from loader.rs)
    // =========================================================================

    // AIDA: legacy entity-linking entry (CoNLL-2003 mentions linked to
    // YAGO/Wikipedia). Its display name carries a "(v2)" suffix.
    // NOTE(review): the file also defines `AIDACoNLL` with name "AIDA-CoNLL"
    // and a URL on the same MPI site — presumably the "(v2)" suffix exists to
    // keep the two names distinct; confirm both entries are still needed.
    AIDA {
        name: "AIDA-CoNLL (v2)",
        description: "Entity linking to Wikipedia. CoNLL-YAGO dataset for named entity disambiguation.",
        url: "https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Hoffart et al. (2011)",
        paper_url: "https://aclanthology.org/D11-1072/",
        year: 2011,
        format: "CoNLL",
        notes: "Entity linking benchmark; links CoNLL-2003 mentions to YAGO/Wikipedia",
        categories: [entity_linking],
    },

    // AIONER: Correct GitHub org is 'ncbi', not 'AIONER'
    // Biomedical NER entry covering gene, disease, chemical and species types.
    // Entity labels are TitleCase here, unlike the upper-case labels used by
    // most other entries. No `paper_url`, `splits` or `tasks` are recorded.
    // NOTE(review): `description` and `notes` describe a model rather than a
    // dataset — confirm what the URL is expected to provide.
    AIONER {
        name: "AIONER",
        description: "All-in-one biomedical NER. Unified biomedical entity extraction model.",
        url: "https://github.com/ncbi/AIONER",
        entity_types: ["Gene", "Disease", "Chemical", "Species"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Luo et al. (2023)",
        year: 2023,
        format: "JSONL",
        notes: "Unified model for multiple biomedical entity types",
        categories: [ner, biomedical],
    },

    // AISHELL-NER: named entities in Mandarin speech; `notes` positions it as a
    // robustness test against ASR errors. Categorised as `speech`.
    // No `paper_url`, `splits` or `tasks` are recorded.
    AISHELLNER {
        name: "AISHELL-NER",
        description: "Chinese speech NER from AISHELL corpus. Named entities in Mandarin speech.",
        url: "https://www.aishelltech.com/aishell_2",
        entity_types: ["PER", "LOC", "ORG"],
        language: "zh",
        domain: "speech",
        license: "Research",
        citation: "AISHELL Foundation (2017)",
        year: 2017,
        format: "Custom",
        notes: "Speech transcription NER; tests robustness to ASR errors",
        categories: [ner, speech],
    },

    // AstroNER: domain-specific NER for astronomy literature (celestial objects,
    // instruments, missions, phenomena), in CoNLL format.
    // No `paper_url`, `splits` or `tasks` are recorded.
    // The citation carries the "(year)" suffix used by every other entry; the
    // year matches this entry's own `year` field.
    AstroNER {
        name: "AstroNER",
        description: "Astronomy named entity recognition. Celestial objects and astronomical concepts.",
        url: "https://github.com/astronomical-ner/AstroNER",
        entity_types: ["CelestialObject", "Instrument", "Mission", "Phenomenon"],
        language: "en",
        domain: "astrophysics",
        license: "CC-BY-4.0",
        citation: "NASA ADS Team (2022)",
        year: 2022,
        format: "CoNLL",
        notes: "Domain-specific NER for astronomy literature",
        categories: [ner, arcane_domain],
    },

    // B2NERD: large-scale, distantly supervised (silver-standard) news NER
    // hosted on Hugging Face; `notes` suggests it for pre-training.
    // No `paper_url`, `splits`, `tasks` or `hf_id` are recorded.
    B2NERD {
        name: "B2NERD",
        description: "Billion-scale news NER dataset. Large-scale distantly supervised NER.",
        url: "https://huggingface.co/datasets/Umean/B2NERD",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Umean (2023)",
        year: 2023,
        format: "JSONL",
        notes: "Large-scale silver-standard NER; useful for pre-training",
        categories: [ner],
    },

    // BioMNER: biomedical method/technique/protocol NER, BIO-tagged.
    // NOTE(review): `url` points at the tner/bionlp2004 Hugging Face dataset;
    // verify that source actually carries the Method/Technique/Protocol labels
    // listed in `entity_types`.
    BioMNER {
        name: "BioMNER",
        description: "Biomedical method NER. Scientific methods and techniques in biomedical text.",
        url: "https://huggingface.co/datasets/tner/bionlp2004",
        entity_types: ["Method", "Technique", "Protocol"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "BioNLP (2004)",
        year: 2004,
        format: "BIO",
        notes: "Biomedical methodology extraction; from BioNLP shared task",
        categories: [ner, biomedical],
    },

    // LegNER: NER over legal documents and court opinions, in CoNLL format.
    // NOTE(review): `url` points at a repository named LegalBench rather than
    // LegNER — verify the link serves this dataset.
    LegNER {
        name: "LegNER",
        description: "Legal domain NER. Named entities in legal documents and court opinions.",
        url: "https://github.com/Liquid-Legal-Institute/LegalBench",
        entity_types: ["Court", "Judge", "Statute", "Party", "Date"],
        language: "en",
        domain: "legal",
        license: "CC-BY-4.0",
        citation: "Legal NLP Team (2021)",
        year: 2021,
        format: "CoNLL",
        notes: "Legal domain specialization; court documents and statutes",
        categories: [ner],
    },

    // OpenNER 1.0: open-domain NER, marked Public. `hf_id` repeats the
    // owner/name path from `url`; `notes` calls the source a community mirror.
    // NOTE(review): `citation` credits Babelscape while the mirror is under a
    // different Hugging Face owner — confirm the attribution.
    OpenNER {
        name: "OpenNER 1.0",
        description: "Open domain NER benchmark. Broad coverage across multiple domains.",
        url: "https://huggingface.co/datasets/yongsun-yoon/open-ner-english",
        entity_types: ["PER", "LOC", "ORG", "EVENT", "PRODUCT"],
        language: "en",
        domain: "mixed",
        license: "CC-BY-SA-4.0",
        citation: "Babelscape (2023)",
        year: 2023,
        format: "JSONL",
        notes: "Community mirror; open-domain NER benchmark",
        tasks: ["ner"],
        hf_id: "yongsun-yoon/open-ner-english",
        access_status: Public,
        categories: [ner],
    },

    // SciNER: NER over scientific paper abstracts and methods sections
    // (method, task, dataset, metric, material).
    // No `paper_url`, `splits` or `tasks` are recorded.
    SciNER {
        name: "SciNER",
        description: "Scientific literature NER. Entities from scientific papers across disciplines.",
        url: "https://github.com/allenai/sciner",
        entity_types: ["Method", "Task", "Dataset", "Metric", "Material"],
        language: "en",
        domain: "scientific",
        license: "Apache-2.0",
        citation: "Allen AI (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Scientific entities; paper abstracts and methods sections",
        categories: [ner],
    },

    // FinanceNER: NER over financial documents and news (SEC filings per
    // `notes`), in CoNLL format. The linked repository is named "finer".
    // No `paper_url`, `splits` or `tasks` are recorded.
    FinanceNER {
        name: "FinanceNER",
        description: "Financial domain NER. Named entities from financial documents and news.",
        url: "https://github.com/nlpaueb/finer",
        entity_types: ["Company", "Stock", "Currency", "Amount", "Date"],
        language: "en",
        domain: "financial",
        license: "Research",
        citation: "FinNLP (2020)",
        year: 2020,
        format: "CoNLL",
        notes: "Financial entity extraction; SEC filings and news",
        categories: [ner],
    },

    // TechNER: software/hardware/technical entity NER drawn from Stack Overflow
    // and documentation per `notes`; filed under the "code" domain.
    // No `paper_url`, `splits` or `tasks` are recorded.
    TechNER {
        name: "TechNER",
        description: "Technology domain NER. Software, hardware, and technical entities.",
        url: "https://github.com/techner/techner",
        entity_types: ["Software", "Hardware", "Company", "Version", "Language"],
        language: "en",
        domain: "code",
        license: "MIT",
        citation: "TechNER Team (2021)",
        year: 2021,
        format: "CoNLL",
        notes: "Technology entities; Stack Overflow and documentation",
        categories: [ner],
    },

    // FictionNER-750M: large-scale fiction NER hosted on Hugging Face and marked
    // Public. `hf_id` repeats the owner/name path from `url`.
    FictionNER750M {
        name: "FictionNER-750M",
        description: "Fiction NER at scale. Named entities from 750M tokens of fiction text.",
        url: "https://huggingface.co/datasets/SaladTechnologies/fiction-ner-750m",
        entity_types: ["Character", "Location", "Object", "Organization"],
        language: "en",
        domain: "fiction",
        license: "CC-BY-4.0",
        citation: "Fiction NER Team (2023)",
        year: 2023,
        format: "JSONL",
        notes: "Large-scale fiction NER; public on HuggingFace",
        tasks: ["ner"],
        hf_id: "SaladTechnologies/fiction-ner-750m",
        access_status: Public,
        categories: [ner, literary],
    },

    // Character Codex: literary character identification, including aliases and
    // roles, for tracking characters across a narrative.
    // No `paper_url`, `splits` or `tasks` are recorded.
    CharacterCodex {
        name: "Character Codex",
        description: "Character entity recognition in fiction. Literary character identification.",
        url: "https://github.com/character-codex/character-codex",
        entity_types: ["Character", "Alias", "Role"],
        language: "en",
        domain: "fiction",
        license: "CC-BY-4.0",
        citation: "Character Codex Team (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Character tracking across narrative; aliases and roles",
        categories: [ner, literary],
    },

    // MUC-6: early NER/coreference dataset in SGML, distributed through the LDC
    // catalog (`license: "LDC"`). Entity types are the ENAMEX/TIMEX/NUMEX
    // markup families; categorised as `historical`.
    MUC6 {
        name: "MUC-6",
        description: "Message Understanding Conference 6. Seminal NER and coreference dataset.",
        url: "https://catalog.ldc.upenn.edu/LDC2003T13",
        entity_types: ["ENAMEX", "TIMEX", "NUMEX"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Grishman & Sundheim (1996)",
        paper_url: "https://aclanthology.org/C96-1079/",
        year: 1996,
        format: "SGML",
        notes: "Historically significant; established NER evaluation paradigm",
        categories: [ner, historical],
    },

    // MUC-7: follow-up to MUC-6, also SGML and distributed through the LDC
    // catalog.
    // NOTE(review): `description` says the NE types were expanded from MUC-6,
    // yet `entity_types` is the same three-item list as the MUC-6 entry —
    // confirm which statement is accurate.
    MUC7 {
        name: "MUC-7",
        description: "Message Understanding Conference 7. Expanded NE types from MUC-6.",
        url: "https://catalog.ldc.upenn.edu/LDC2001T02",
        entity_types: ["ENAMEX", "TIMEX", "NUMEX"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Chinchor (1998)",
        paper_url: "https://aclanthology.org/M98-1002/",
        year: 1998,
        format: "SGML",
        notes: "Refined MUC-6 guidelines; includes satellite launch texts",
        categories: [ner, historical],
    },

    // OntoNotes 5.0: multi-genre corpus used for both NER and coreference,
    // distributed through the LDC catalog. It is the only entry in this section
    // that sets `annotation_scheme` ("CoNLLCoref").
    // NOTE(review): 17 entity types are listed; OntoNotes 5.0 is usually
    // described with 18 (PRODUCT is absent here) — confirm the label set.
    OntoNotes50 {
        name: "OntoNotes 5.0",
        description: "OntoNotes Release 5.0. Multi-genre corpus with NER, coref, and more.",
        url: "https://catalog.ldc.upenn.edu/LDC2013T19",
        entity_types: ["PER", "ORG", "GPE", "LOC", "FAC", "NORP", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"],
        language: "en",
        domain: "mixed",
        license: "LDC",
        citation: "Weischedel et al. (2013)",
        paper_url: "https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf",
        year: 2013,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        size_hint: "~2.9M words across genres",
        notes: "Gold standard for multiple NLP tasks; WSJ, broadcast, web, telephone",
        categories: [ner, coref],
    },

    // GUM: multi-genre English corpus with entity and coreference layers, hosted on
    // GitHub under CC-BY-4.0. Declares no splits, tasks, or access_status.
    GUM {
        name: "GUM",
        description: "Georgetown University Multilayer corpus. Rich annotation across 12 genres.",
        url: "https://github.com/amir-zeldes/gum",
        // Lower-case labels, unlike the upper-case PER/LOC/ORG style used by most entries.
        entity_types: ["person", "place", "organization", "time", "event"],
        language: "en",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "Zeldes (2017)",
        paper_url: "https://aclanthology.org/W17-0809/",
        year: 2017,
        format: "CoNLL",
        size_hint: "~200k tokens, 12 genres",
        notes: "Multi-layer annotation; coreference, RST, entities",
        categories: [ner, coref],
    },

    // TAC-KBP: entity-linking and slot-filling benchmark; entity_linking is its only
    // category. The url is the TAC site root, not a direct data download.
    TACKBP {
        name: "TAC-KBP",
        description: "TAC Knowledge Base Population. Entity linking and slot filling benchmark.",
        url: "https://tac.nist.gov/",
        entity_types: ["PER", "ORG", "GPE"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Ji et al. (2010)",
        paper_url: "https://aclanthology.org/C10-1058/",
        year: 2010,
        format: "Custom",
        notes: "Entity linking to Wikipedia/KB; slot filling for attributes",
        categories: [entity_linking],
    },

    // HAREM: Portuguese NER evaluation data in SGML under a research licence.
    // Declares no size_hint, splits, tasks, or access_status.
    HAREM {
        name: "HAREM",
        description: "Portuguese NER evaluation. First and Second HAREM conferences.",
        url: "https://www.linguateca.pt/HAREM/",
        // Labels are kept in Portuguese.
        entity_types: ["PESSOA", "LOCAL", "ORGANIZACAO", "TEMPO", "VALOR"],
        language: "pt",
        domain: "news",
        license: "Research",
        citation: "Santos et al. (2006)",
        // Same landing page as `url`; no separate paper link is recorded.
        paper_url: "https://www.linguateca.pt/HAREM/",
        year: 2006,
        format: "SGML",
        notes: "Portuguese NER benchmark; morphologically rich language",
        categories: [ner, multilingual],
    },

    // Gun Violence Corpus: English news event data with role-style entity types,
    // categorised as ner + event_coref. Declares no paper_url, size_hint, splits,
    // tasks, or access_status.
    GunViolenceCorpus {
        name: "Gun Violence Corpus (v2)",
        description: "Gun violence event extraction. Named entities and events from news.",
        url: "https://github.com/gun-violence-corpus/gvc",
        // Event participant roles rather than generic entity classes.
        entity_types: ["Shooter", "Victim", "Weapon", "Location", "Date"],
        language: "en",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Pavlick et al. (2016)",
        year: 2016,
        format: "Custom",
        notes: "Event extraction; sensitive domain requiring careful handling",
        categories: [ner, event_coref],
    },

    // =========================================================================
    // Event Extraction (Trigger + Argument)
    // =========================================================================
    // Note: anno supports event extraction via EventExtractor (lexicon-based + GLiNER).
    // The event datasets in this section support the Task::EventExtraction capability.

    // MAVEN: general-domain event detection over Wikipedia text (triggers only).
    // A fully specified entry: annotation_scheme, size_hint, splits, tasks, and
    // access_status are all present.
    MAVEN {
        name: "MAVEN",
        description: "Massive general-domain event detection. 168 event types from Wikipedia, 4x larger than ACE.",
        url: "https://github.com/THU-KEG/MAVEN-dataset",
        entity_types: ["EVENT_TRIGGER"],  // 168 fine-grained event types
        language: "en",
        domain: "wikipedia",
        license: "MIT",
        citation: "Wang et al. (2020)",
        paper_url: "https://aclanthology.org/2020.emnlp-main.129/",
        year: 2020,
        format: "JSONL",
        annotation_scheme: "Trigger-based",
        size_hint: "~118k trigger instances, 4,480 documents, 168 event types",
        notes: "EMNLP 2020; largest general-domain ED dataset; CodaLab leaderboard available; Tsinghua Cloud/Google Drive download",
        // The validation split is named "valid" here, not "dev" as in some other entries.
        splits: ["train", "valid", "test"],
        tasks: ["event_extraction"],
        access_status: Public,
        categories: [ner],  // Event triggers are span-based like NER
    },

    // MAVEN-ARG: extends MAVEN with event arguments and relations, so it is tagged
    // for both event_extraction and relation_extraction.
    MAVENArg {
        name: "MAVEN-ARG",
        description: "MAVEN extended with event arguments and relations. Complete event extraction benchmark.",
        url: "https://github.com/THU-KEG/MAVEN-Argument",
        // Coarse annotation layers, not the fine-grained event type inventory.
        entity_types: ["EVENT_TRIGGER", "EVENT_ARGUMENT", "EVENT_RELATION"],
        language: "en",
        domain: "wikipedia",
        license: "MIT",
        citation: "Wang et al. (2024)",
        paper_url: "https://aclanthology.org/2024.acl-long.224/",
        year: 2024,
        format: "JSONL",
        annotation_scheme: "Trigger-Argument",
        size_hint: "~98k argument annotations, ~21k relations",
        notes: "ACL 2024; builds on MAVEN; supports ED + EAE + ERE tasks; all-in-one event understanding",
        splits: ["train", "valid", "test"],
        tasks: ["event_extraction", "relation_extraction"],
        access_status: Public,
        categories: [ner, relation_extraction],
    },

    // CASIE: cybersecurity event extraction from news articles, in standoff format.
    // Declares no annotation_scheme.
    CASIE {
        name: "CASIE",
        description: "Cybersecurity event extraction. Attack patterns, vulnerabilities, malware events.",
        url: "https://github.com/Ebiquity/CASIE",
        // Five labels, matching the "5 event types" in size_hint.
        entity_types: ["Attack-Pattern", "Vulnerability", "Data-Breach", "Malware", "Patch"],
        language: "en",
        domain: "cybersecurity",
        license: "CC-BY-4.0",
        citation: "Satyapanich et al. (2020)",
        paper_url: "https://aclanthology.org/2020.case-1.12/",
        year: 2020,
        format: "Standoff",
        size_hint: "~1k documents, 5 event types, 26 argument roles",
        notes: "Domain-specific event extraction; cybersecurity news articles",
        splits: ["train", "dev", "test"],
        tasks: ["event_extraction", "ner"],
        access_status: Public,
        categories: [ner, arcane_domain],
    },

    // RAMS: cross-sentence event argument extraction over news text; tagged
    // long_document because arguments may lie outside the trigger's sentence.
    RAMS {
        name: "RAMS",
        description: "Roles Across Multiple Sentences. Cross-sentence event argument extraction with 139 event types.",
        url: "https://nlp.jhu.edu/rams/",
        entity_types: ["EVENT_TRIGGER", "EVENT_ARGUMENT"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Ebner et al. (2020)",
        paper_url: "https://aclanthology.org/2020.acl-main.718/",
        year: 2020,
        format: "JSONL",
        size_hint: "~9,124 event instances, 139 event types",
        notes: "ACL 2020; tests implicit/cross-sentence arguments; requires multi-sentence reasoning",
        splits: ["train", "dev", "test"],
        tasks: ["event_extraction"],
        access_status: Public,
        categories: [ner, long_document],
    },

    // SLUE: NER over speech transcripts. Declares no size_hint, splits, tasks, or
    // access_status.
    // NOTE(review): this entry follows the "Event Extraction" section header but its
    // categories are ner + speech; confirm whether it belongs in another section.
    SLUE {
        name: "SLUE",
        description: "Spoken Language Understanding Evaluation. NER in speech transcripts.",
        url: "https://github.com/asappresearch/slue-toolkit",
        entity_types: ["PER", "LOC", "ORG", "DATE"],
        language: "en",
        domain: "speech",
        license: "MIT",
        citation: "Shon et al. (2022)",
        paper_url: "https://aclanthology.org/2022.naacl-main.137/",
        year: 2022,
        format: "JSONL",
        notes: "End-to-end speech NER; VoxPopuli and VoxCeleb sources",
        categories: [ner, speech],
    },

    // CRAFT Coreference: coreference over full-text biomedical articles, in standoff
    // format. Categorised as coref + biomedical; `ner` is not listed.
    CRAFTCoref {
        name: "CRAFT Coreference",
        description: "Colorado Richly Annotated Full-Text corpus coreference. Biomedical coref.",
        url: "https://github.com/UCDenver-ccp/CRAFT",
        entity_types: ["Gene", "Protein", "Cell", "Organism"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-4.0",
        citation: "Cohen et al. (2017)",
        paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/bax087/4621360",
        year: 2017,
        format: "Standoff",
        notes: "Full-text biomedical articles; coreference including bridging",
        categories: [coref, biomedical],
    },

    // Football Coreference Corpus: cross-document event coreference in the sports
    // domain; event_coref is its only category. Declares no paper_url, size_hint,
    // splits, tasks, or access_status.
    FootballCorefCorpus {
        name: "Football Coreference Corpus (v2)",
        description: "Cross-document event coreference for football matches.",
        url: "https://github.com/cltl/FCC",
        entity_types: ["Event", "Team", "Player", "Location"],
        language: "en",
        domain: "sports",
        license: "CC-BY-4.0",
        citation: "Vossen et al. (2018)",
        year: 2018,
        format: "Custom",
        notes: "Cross-document event coreference; sports domain",
        categories: [event_coref],
    },

    // Multiparty Dialogue Coreference: coreference over multi-speaker meeting and
    // chat transcripts. Declares no paper_url, size_hint, splits, tasks, or
    // access_status.
    MultipartyDialogueCoref {
        name: "Multiparty Dialogue Coreference",
        description: "Coreference in multi-party conversations. Meeting and chat transcripts.",
        url: "https://github.com/sopan-sarkar/multiparty-dialogue-coref",
        entity_types: ["PER", "ORG", "LOC"],
        language: "en",
        domain: "dialogue",
        license: "CC-BY-4.0",
        citation: "Sarkar et al. (2020)",
        year: 2020,
        format: "JSONL",
        notes: "Multi-party setting; speaker identification challenges",
        categories: [coref, dialogue],
    },

    // CODI-CRAC: shared-task data for anaphora and coreference across several
    // languages. Declares no size_hint, splits, tasks, or access_status.
    CODICRAC {
        name: "CODI-CRAC",
        description: "CODI/CRAC shared task on anaphora and coreference. Multiple languages.",
        url: "https://github.com/UniversalAnaphora/UA-CODI-CRAC",
        entity_types: ["PER", "ORG", "LOC", "Event"],
        // "mul" marks a multi-language resource instead of naming one language.
        language: "mul",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "CODI-CRAC Team (2022)",
        paper_url: "https://aclanthology.org/2022.codi-1.0/",
        year: 2022,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        notes: "Shared task data; includes bridging and discourse deixis",
        categories: [coref, multilingual],
    },

    // MixRED: mixed-domain relation extraction data; relation_extraction is its only
    // category. Declares no paper_url, size_hint, splits, tasks, or access_status.
    MixRED {
        name: "MixRED",
        description: "Mixed relation extraction dataset. Multiple relation types and domains.",
        url: "https://github.com/mixred/MixRED",
        entity_types: ["PER", "ORG", "LOC"],
        language: "en",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "MixRED Team (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Relation extraction across multiple domains",
        categories: [relation_extraction],
    },

    // CovEReD: relation extraction over COVID-19 biomedical literature.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    CovEReD {
        name: "CovEReD",
        description: "COVID-19 relation extraction dataset. Biomedical relations from pandemic literature.",
        url: "https://github.com/covered/CovEReD",
        entity_types: ["Drug", "Disease", "Gene", "Symptom"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-4.0",
        citation: "CovEReD Team (2021)",
        year: 2021,
        format: "JSONL",
        notes: "COVID-19 specific; drug-disease-gene relations",
        categories: [relation_extraction, biomedical],
    },

    // SciER: entity and relation extraction over scientific paper abstracts, with
    // nested entities. Declares no size_hint, splits, tasks, or access_status.
    // NOTE(review): the entry is named "SciER" but the url path is "sciie"; confirm
    // the repository matches the dataset cited here.
    SciER {
        name: "SciER",
        description: "Scientific entity and relation extraction. From AI/ML papers.",
        url: "https://github.com/allenai/sciie",
        entity_types: ["Task", "Method", "Metric", "Material", "Generic"],
        language: "en",
        domain: "scientific",
        license: "Apache-2.0",
        citation: "Luan et al. (2018)",
        paper_url: "https://aclanthology.org/D18-1360/",
        year: 2018,
        format: "JSONL",
        notes: "Scientific IE; paper abstracts with nested entities",
        categories: [ner, relation_extraction, nested_ner],
    },

    // WebNLG: RDF-triple-to-text data, registered here for relation extraction.
    // The identifier is upper-case (WEBNLG) while the display name is "WebNLG".
    WEBNLG {
        name: "WebNLG",
        description: "Web NLG Challenge dataset. RDF-to-text generation with entity-relation triples.",
        url: "https://gitlab.com/webnlg/challenge-2017",
        // One generic label; no finer entity classes are declared for the triples.
        entity_types: ["Entity"],
        language: "en",
        domain: "wikipedia",
        license: "CC-BY-4.0",
        citation: "Gardent et al. (2017)",
        paper_url: "https://aclanthology.org/W17-3518/",
        year: 2017,
        format: "XML",
        notes: "RDF triples to natural language; 15 DBpedia categories",
        categories: [relation_extraction],
    },

    // =========================================================================
    // Ancient/Historical Language Treebanks
    // =========================================================================

    // Akkadian UD treebank (CoNLL-U). Categorised historical + ancient; `ner` is not
    // listed. Declares no paper_url, size_hint, splits, tasks, or access_status.
    AkkadianUD {
        name: "Akkadian UD",
        description: "Universal Dependencies for Akkadian. Cuneiform texts from ancient Mesopotamia.",
        url: "https://universaldependencies.org/treebanks/akk_pisandub/index.html",
        entity_types: ["PER", "LOC", "GPE"],
        language: "akk",
        domain: "historical",
        license: "CC-BY-SA-4.0",
        // The citation has no year, although a `year` field is given below.
        citation: "UD Akkadian Team",
        year: 2020,
        format: "CoNLLU",
        notes: "Cuneiform script; extinct Semitic language",
        categories: [historical, ancient],
    },

    // Ancient (Biblical) Hebrew UD treebank (CoNLL-U), filed under the "religious"
    // domain. Declares no paper_url, size_hint, splits, tasks, or access_status.
    AncientHebrewUD {
        name: "Ancient Hebrew UD",
        description: "Universal Dependencies for Biblical Hebrew. Hebrew Bible text.",
        url: "https://universaldependencies.org/treebanks/hbo_ptnk/index.html",
        entity_types: ["PER", "LOC", "GPE"],
        language: "hbo",
        domain: "religious",
        license: "CC-BY-SA-4.0",
        citation: "UD Hebrew Team",
        year: 2019,
        format: "CoNLLU",
        notes: "Biblical Hebrew; Torah and Prophets",
        categories: [historical, ancient],
    },

    // Classical (Literary) Chinese UD treebank (CoNLL-U) from the Kyoto UD project
    // page. Declares no paper_url, size_hint, splits, tasks, or access_status.
    ClassicalChineseUD {
        name: "Classical Chinese UD",
        description: "Universal Dependencies for Classical/Literary Chinese. Pre-modern texts.",
        url: "https://universaldependencies.org/treebanks/lzh_kyoto/index.html",
        entity_types: ["PER", "LOC", "GPE"],
        language: "lzh",
        domain: "historical",
        license: "CC-BY-SA-4.0",
        citation: "UD Classical Chinese Team",
        year: 2018,
        format: "CoNLLU",
        notes: "Literary Chinese; classical texts and commentaries",
        categories: [historical, ancient],
    },

    // Coptic UD treebank (CoNLL-U), filed under the "religious" domain.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    CopticUD {
        name: "Coptic UD",
        description: "Universal Dependencies for Coptic. Late Egyptian language.",
        url: "https://universaldependencies.org/treebanks/cop_scriptorium/index.html",
        entity_types: ["PER", "LOC", "GPE"],
        language: "cop",
        domain: "religious",
        license: "CC-BY-SA-4.0",
        citation: "Zeldes & Schroeder (2016)",
        year: 2016,
        format: "CoNLLU",
        notes: "Coptic; Gnostic and Biblical texts",
        categories: [historical, ancient],
    },

    // Gothic UD treebank (CoNLL-U) from the PROIEL project. The licence is
    // non-commercial (NC). Declares no paper_url, size_hint, splits, tasks, or
    // access_status.
    GothicUD {
        name: "Gothic UD",
        description: "Universal Dependencies for Gothic. Wulfila's Bible translation.",
        url: "https://universaldependencies.org/treebanks/got_proiel/index.html",
        entity_types: ["PER", "LOC", "GPE"],
        language: "got",
        domain: "religious",
        license: "CC-BY-NC-SA-4.0",
        citation: "PROIEL Team",
        year: 2014,
        format: "CoNLLU",
        notes: "Gothic; oldest substantial Germanic text",
        categories: [historical, ancient],
    },

    // Hittite UD treebank (CoNLL-U). Declares no paper_url, size_hint, splits,
    // tasks, or access_status.
    HittiteUD {
        name: "Hittite UD",
        description: "Universal Dependencies for Hittite. Ancient Anatolian language.",
        url: "https://universaldependencies.org/treebanks/hit_hittb/index.html",
        entity_types: ["PER", "LOC", "GPE"],
        language: "hit",
        domain: "historical",
        license: "CC-BY-SA-4.0",
        citation: "UD Hittite Team",
        year: 2021,
        format: "CoNLLU",
        notes: "Cuneiform Hittite; Bronze Age Anatolia",
        categories: [historical, ancient],
    },

    // Old Church Slavonic UD treebank (CoNLL-U) from the PROIEL project. The licence
    // is non-commercial (NC). Declares no paper_url, size_hint, splits, tasks, or
    // access_status.
    OldChurchSlavonicUD {
        name: "Old Church Slavonic UD",
        description: "Universal Dependencies for OCS. Medieval Slavic liturgical language.",
        url: "https://universaldependencies.org/treebanks/cu_proiel/index.html",
        entity_types: ["PER", "LOC", "GPE"],
        // Two-letter code, matching the "cu_proiel" treebank id in the url.
        language: "cu",
        domain: "religious",
        license: "CC-BY-NC-SA-4.0",
        citation: "PROIEL Team",
        year: 2014,
        format: "CoNLLU",
        notes: "Oldest Slavic literary language; Cyrillic/Glagolitic",
        categories: [historical, ancient],
    },

    // Latin ITTB: Index Thomisticus Treebank of medieval Latin (CoNLL-U).
    // Categorised as historical only, without `ancient`. Declares no paper_url,
    // size_hint, splits, tasks, or access_status.
    LatinITTB {
        name: "Latin ITTB",
        description: "Index Thomisticus Treebank. Medieval Latin theological texts.",
        url: "https://universaldependencies.org/treebanks/la_ittb/index.html",
        // Uses ORG where the neighbouring UD entries use GPE.
        entity_types: ["PER", "LOC", "ORG"],
        language: "la",
        domain: "religious",
        // Version 3.0 of the licence; the other PROIEL-style entries use 4.0.
        license: "CC-BY-NC-SA-3.0",
        citation: "McGillivray et al. (2009)",
        year: 2009,
        format: "CoNLLU",
        notes: "Aquinas texts; medieval scholastic Latin",
        categories: [historical],
    },

    // Latin PROIEL: classical and late Latin treebank (CoNLL-U). Shares language
    // code "la" with LatinITTB. Declares no paper_url, size_hint, splits, tasks, or
    // access_status.
    LatinPROIEL {
        name: "Latin PROIEL",
        description: "Pragmatic Resources in Old Indo-European Languages. Classical Latin.",
        url: "https://universaldependencies.org/treebanks/la_proiel/index.html",
        entity_types: ["PER", "LOC", "GPE"],
        language: "la",
        domain: "historical",
        license: "CC-BY-NC-SA-4.0",
        citation: "PROIEL Team",
        year: 2014,
        format: "CoNLLU",
        notes: "Vulgate, Caesar, Cicero; classical and late Latin",
        categories: [historical],
    },

    // Esperanto UD treebank (CoNLL-U). Declares no paper_url, size_hint, splits,
    // tasks, or access_status.
    // NOTE(review): this entry sits at the end of the Ancient/Historical section but
    // is categorised `constructed`; confirm whether it belongs under the
    // Constructed/Fictional Languages header that follows.
    EsperantoUD {
        name: "Esperanto UD",
        description: "Universal Dependencies for Esperanto. Planned international language.",
        url: "https://universaldependencies.org/treebanks/eo_pud/index.html",
        entity_types: ["PER", "LOC", "ORG"],
        language: "eo",
        domain: "constructed_language",
        license: "CC-BY-SA-4.0",
        citation: "UD Esperanto Team",
        year: 2017,
        format: "CoNLLU",
        notes: "Constructed language; regular agglutinative morphology",
        categories: [constructed],
    },

    // =========================================================================
    // Constructed/Fictional Languages
    // =========================================================================

    // Dothraki: fictional constructed-language corpus sourced from the Dothraki wiki.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    Dothraki {
        name: "Dothraki",
        description: "Dothraki language corpus. Game of Thrones constructed language.",
        url: "https://wiki.dothraki.org/",
        entity_types: ["PER", "LOC"],
        // Dothraki has no ISO 639 code of its own. The previous value "dlk" is the
        // ISO 639-3 code for Dahalik, a natural language, so the entry was
        // mislabelled. "art" is the ISO 639-2 collective code for artificial languages.
        language: "art",
        domain: "fiction",
        license: "CC-BY-SA-4.0",
        citation: "Peterson (2011)",
        year: 2011,
        format: "Custom",
        notes: "Conlang by David Peterson; SVO word order",
        categories: [constructed],
    },

    // High Valyrian: fictional constructed-language corpus; the url is a page on the
    // Dothraki wiki. Declares no paper_url, size_hint, splits, tasks, or
    // access_status.
    HighValyrian {
        name: "High Valyrian",
        description: "High Valyrian corpus. Game of Thrones constructed language.",
        url: "https://wiki.dothraki.org/High_Valyrian",
        entity_types: ["PER", "LOC"],
        // NOTE(review): "hvy" does not look like an assigned ISO 639 code; confirm
        // which code scheme is intended for languages without a registered code.
        language: "hvy",
        domain: "fiction",
        license: "CC-BY-SA-4.0",
        citation: "Peterson (2013)",
        year: 2013,
        format: "Custom",
        notes: "Highly inflected conlang; 4 genders, 8 cases",
        categories: [constructed],
    },

    // Klingon: fictional constructed-language corpus under a research licence.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    Klingon {
        name: "Klingon",
        description: "Klingon language corpus. Star Trek constructed language.",
        url: "https://github.com/klingonlanguage/klingon-data",
        entity_types: ["PER", "LOC", "ORG"],
        language: "tlh",
        domain: "fiction",
        license: "Research",
        citation: "Okrand (1985)",
        year: 1985,
        format: "Custom",
        notes: "OVS word order; unique phonology; active community",
        categories: [constructed],
    },

    // Quenya: fictional constructed-language corpus (Tolkien's Elvish), linked to
    // eldamo.org. Declares no paper_url, size_hint, splits, tasks, or access_status.
    Quenya {
        name: "Quenya",
        description: "Quenya language corpus. Tolkien's Elvish language.",
        url: "https://eldamo.org/",
        entity_types: ["PER", "LOC"],
        language: "qya",
        domain: "fiction",
        license: "CC-BY-4.0",
        citation: "Tolkien (1954)",
        year: 1954,
        format: "Custom",
        notes: "Finnish-inspired phonology; Tengwar script",
        categories: [constructed],
    },

    // Na'vi: fictional constructed-language corpus (Avatar) under a research licence.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    Navi {
        name: "Na'vi",
        description: "Na'vi language corpus. Avatar constructed language.",
        url: "https://learnnavi.org/",
        entity_types: ["PER", "LOC"],
        // Na'vi has no ISO 639 code of its own. The previous value "nav" is the
        // ISO 639 code for Navajo, so the entry was mislabelled as a natural
        // language. "art" is the ISO 639-2 collective code for artificial languages.
        language: "art",
        domain: "fiction",
        license: "Research",
        citation: "Frommer (2009)",
        year: 2009,
        format: "Custom",
        notes: "Free word order; ejectives; infixes",
        categories: [constructed],
    },

    // Interslavic: zonal auxiliary language corpus. It uses the
    // "constructed_language" domain, not "fiction" as the fictional conlangs do.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    InterslavicCorpus {
        name: "Interslavic",
        description: "Interslavic zonal auxiliary language. Constructed for Slavic intelligibility.",
        url: "https://interslavic.fun/",
        entity_types: ["PER", "LOC", "ORG"],
        language: "isv",
        domain: "constructed_language",
        license: "CC-BY-SA-4.0",
        citation: "Interslavic Team (2006)",
        year: 2006,
        format: "Custom",
        notes: "Maximizes mutual intelligibility across Slavic languages",
        categories: [constructed],
    },

    // Lojban: logical constructed-language corpus. Declares no paper_url, size_hint,
    // splits, tasks, or access_status.
    Lojban {
        name: "Lojban",
        description: "Lojban logical language corpus. Constructed for unambiguous communication.",
        url: "https://mw.lojban.org/",
        // Deliberately empty: no entity label set is declared for this corpus.
        entity_types: [],
        language: "jbo",
        domain: "constructed_language",
        license: "Public Domain",
        citation: "Cowan (1997)",
        year: 1997,
        format: "Custom",
        notes: "Predicate logic-based; completely unambiguous grammar",
        categories: [constructed],
    },

    // Toki Pona: minimalist constructed-language corpus. Declares no paper_url,
    // size_hint, splits, tasks, or access_status.
    TokiPona {
        name: "Toki Pona",
        description: "Toki Pona minimalist language corpus. 120-word philosophical language.",
        url: "https://github.com/kilipan/toki-pona-corpus",
        // Deliberately empty: no entity label set is declared for this corpus.
        entity_types: [],
        language: "tok",
        domain: "constructed_language",
        license: "CC-BY-SA-4.0",
        citation: "Lang (2001)",
        year: 2001,
        format: "Custom",
        notes: "Minimalist; tests compositional semantics",
        categories: [constructed],
    },

    // =========================================================================
    // Clinical/Medical Datasets
    // =========================================================================

    // i2b2/VA 2010: clinical concept and relation extraction over clinical notes.
    // The licence value indicates a data use agreement is required. Declares no
    // size_hint, splits, tasks, or access_status.
    I2B22010 {
        name: "i2b2-2010",
        description: "i2b2/VA 2010 NLP Challenge. Clinical concept extraction and relations.",
        url: "https://www.i2b2.org/NLP/DataSets/",
        entity_types: ["Problem", "Treatment", "Test"],
        language: "en",
        domain: "clinical",
        license: "DUA Required",
        // Citation year (2011) differs from `year` (2010), the challenge year in the name.
        citation: "Uzuner et al. (2011)",
        paper_url: "https://academic.oup.com/jamia/article/18/5/552/830538",
        year: 2010,
        format: "Custom",
        notes: "Clinical notes; concept and relation extraction",
        categories: [ner, relation_extraction, clinical],
    },

    // i2b2 2014 De-identification: PHI recognition in clinical text. Shares its url
    // with the i2b2-2010 entry. Declares no paper_url, size_hint, splits, tasks, or
    // access_status.
    I2b2Deidentification {
        name: "i2b2 De-identification",
        description: "i2b2 2014 De-identification Challenge. PHI recognition and removal.",
        url: "https://www.i2b2.org/NLP/DataSets/",
        // Protected-health-information categories rather than generic entity classes.
        entity_types: ["Name", "Date", "Address", "Phone", "SSN", "MRN"],
        language: "en",
        domain: "clinical",
        license: "DUA Required",
        // Citation year (2015) differs from `year` (2014), the challenge year in the description.
        citation: "Stubbs et al. (2015)",
        year: 2014,
        format: "Custom",
        notes: "PHI de-identification; HIPAA compliance",
        categories: [ner, clinical],
    },

    // French Clinical NER: clinical NER over French hospital records, in standoff
    // format. The licence value indicates a data use agreement is required. Declares
    // no paper_url, size_hint, splits, tasks, or access_status.
    FrenchClinicalNER {
        name: "French Clinical NER",
        description: "French clinical NER from hospital records. APHP collaboration.",
        url: "https://github.com/EDS-NLP/eds-nlp",
        entity_types: ["Drug", "Disease", "Procedure", "Date"],
        language: "fr",
        domain: "clinical",
        license: "DUA Required",
        citation: "APHP Team (2022)",
        year: 2022,
        format: "Standoff",
        notes: "French clinical text; covers multiple entity types",
        categories: [ner, clinical, multilingual],
    },

    // ShARe/CLEF eHealth 2013: disorder mention recognition, hosted on PhysioNet.
    // Tagged discontinuous_ner. Declares no paper_url, size_hint, splits, tasks, or
    // access_status.
    ShARe13 {
        name: "ShARe/CLEF 2013",
        description: "ShARe/CLEF eHealth 2013. Disorder mention recognition.",
        url: "https://physionet.org/content/shareclefehealth2013/",
        // One label only: the task targets disorder mentions.
        entity_types: ["Disorder"],
        language: "en",
        domain: "clinical",
        license: "PhysioNet",
        citation: "Suominen et al. (2013)",
        year: 2013,
        format: "Standoff",
        notes: "Clinical disorder identification; SNOMED CT normalization",
        categories: [ner, clinical, discontinuous_ner],
    },

    // ShARe/CLEF eHealth 2014: extension of the 2013 disorder task, hosted on
    // PhysioNet. Declares no paper_url, size_hint, splits, tasks, or access_status.
    ShARe14 {
        name: "ShARe/CLEF 2014",
        description: "ShARe/CLEF eHealth 2014. Improved disorder normalization.",
        url: "https://physionet.org/content/shareclefehealth2014/",
        entity_types: ["Disorder"],
        language: "en",
        domain: "clinical",
        license: "PhysioNet",
        citation: "Mowery et al. (2014)",
        year: 2014,
        format: "Standoff",
        notes: "Extended from 2013; template filling and normalization",
        categories: [ner, clinical, discontinuous_ner],
    },

    // =========================================================================
    // Code-switching and Multilingual Social Media
    // =========================================================================

    // CALCS: code-switching NER workshop data covering several language pairs.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    CALCS {
        name: "CALCS",
        description: "Computational Approaches to Linguistic Code-Switching. Multiple language pairs.",
        url: "https://code-switching.github.io/",
        entity_types: ["PER", "LOC", "ORG"],
        // "mul" because each text mixes languages (pairs listed in `notes`).
        language: "mul",
        domain: "social_media",
        license: "Research",
        // The citation has no year, although a `year` field is given below.
        citation: "CALCS Workshop",
        year: 2018,
        format: "CoNLL",
        notes: "Code-switching NER; Spanish-English, Hindi-English",
        categories: [ner, multilingual, social_media],
    },

    // LinCE: code-switching evaluation benchmark that includes an NER task.
    // Declares no size_hint, splits, tasks, or access_status.
    LinCE {
        name: "LinCE",
        description: "Linguistic Code-switching Evaluation. Multiple code-switching benchmarks.",
        url: "https://ritual.uh.edu/lince/",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mul",
        domain: "social_media",
        license: "Research",
        citation: "Aguilar et al. (2020)",
        paper_url: "https://aclanthology.org/2020.lrec-1.223/",
        year: 2020,
        format: "CoNLL",
        notes: "Spanish-English, Hindi-English; includes NER task",
        categories: [ner, multilingual, social_media],
    },

    // GLUECoS: NLU benchmark for code-switched text, registered here for its NER
    // component. Declares no size_hint, splits, tasks, or access_status.
    GLUECoS {
        name: "GLUECoS",
        description: "Code-Switching GLUE benchmark. NLU for code-switched text.",
        url: "https://github.com/microsoft/GLUECoS",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mul",
        domain: "social_media",
        license: "MIT",
        citation: "Khanuja et al. (2020)",
        // NOTE(review): this is an EMNLP 2020 anthology id; confirm it resolves to the
        // GLUECoS paper cited above.
        paper_url: "https://aclanthology.org/2020.emnlp-main.574/",
        year: 2020,
        format: "JSONL",
        notes: "Hindi-English and Spanish-English; NLU tasks",
        categories: [ner, multilingual, social_media],
    },

    // =========================================================================
    // Additional Specialized Datasets
    // =========================================================================

    // ChemDataExtractor: chemical NER and property extraction benchmark, filed under
    // the "biomedical" domain. Declares no paper_url, size_hint, splits, tasks, or
    // access_status.
    ChemDataExtractor {
        name: "ChemDataExtractor",
        description: "Chemical data extraction toolkit benchmark. Chemical NER and properties.",
        url: "https://chemdataextractor.org/",
        entity_types: ["Chemical", "Property", "Value", "Unit"],
        language: "en",
        domain: "biomedical",
        license: "MIT",
        citation: "Swain & Cole (2016)",
        year: 2016,
        format: "Custom",
        notes: "Chemical property extraction; materials science",
        categories: [ner, biomedical],
    },

    // HUPD: Harvard USPTO patent application data, filed under the "legal" domain
    // with only the `ner` category. Declares no paper_url, size_hint, splits, tasks,
    // or access_status.
    HUPD {
        name: "HUPD",
        description: "Harvard USPTO Patent Dataset. Patent application NER.",
        url: "https://github.com/suzgunmirac/hupd",
        entity_types: ["Inventor", "Assignee", "Reference", "Claim"],
        language: "en",
        domain: "legal",
        license: "Public Domain",
        citation: "Suzgun et al. (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Patent applications; technical language",
        categories: [ner],
    },

    // FinTech Patent NER: entity extraction over financial-technology patents.
    // The url is a GitHub account/organisation path with no repository component.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    FinTechPatent {
        name: "FinTech Patent NER",
        description: "FinTech patent entity extraction. Financial technology domain.",
        url: "https://github.com/fintech-patent-ner",
        entity_types: ["Technology", "Company", "Product", "Method"],
        language: "en",
        domain: "financial",
        license: "CC-BY-4.0",
        citation: "FinTech NER Team (2021)",
        year: 2021,
        format: "CoNLL",
        notes: "FinTech patents; specialized terminology",
        categories: [ner],
    },

    // WaterAgriNER: NER for the water and agriculture domains.
    // The url is a GitHub account/organisation path with no repository component.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    WaterAgriNER {
        name: "WaterAgriNER",
        description: "Water and agriculture domain NER. Environmental science entities.",
        url: "https://github.com/wateragriner",
        entity_types: ["Crop", "Chemical", "Equipment", "Location"],
        language: "en",
        domain: "scientific",
        license: "CC-BY-4.0",
        citation: "WaterAgriNER Team (2022)",
        year: 2022,
        format: "CoNLL",
        notes: "Agricultural and water management domains",
        categories: [ner],
    },

    // WIESP Astrophysics: NER over astrophysics literature. The url is the NASA ADS
    // search UI, not a dataset download. Declares no paper_url, size_hint, splits,
    // tasks, or access_status.
    WIESPAstro {
        name: "WIESP Astrophysics",
        description: "WIESP 2022 Astrophysics NER. NASA ADS literature.",
        url: "https://ui.adsabs.harvard.edu/",
        // Only four labels are listed, although `notes` mentions 31 fine-grained types.
        entity_types: ["Mission", "Instrument", "CelestialObject", "Phenomenon"],
        language: "en",
        domain: "astrophysics",
        license: "Research",
        citation: "WIESP Team (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Astrophysics entities; 31 fine-grained types",
        categories: [ner, arcane_domain],
    },

    // NER Social Food: food-related NER over social media text.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    NERsocialFood {
        name: "NER Social Food",
        description: "Food-related NER from social media. Recipes and food mentions.",
        url: "https://github.com/food-ner/social",
        entity_types: ["Food", "Ingredient", "Brand", "Restaurant"],
        language: "en",
        // The domain is "food"; the social-media aspect is captured in `categories`.
        domain: "food",
        license: "CC-BY-4.0",
        citation: "Food NER Team (2021)",
        year: 2021,
        format: "CoNLL",
        notes: "Social media food mentions; informal language",
        categories: [ner, social_media],
    },

    // Russian Cultural NER: cultural-heritage NER in Russian.
    // The url is a GitHub account/organisation path with no repository component.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    RussianCulturalNER {
        name: "Russian Cultural NER",
        description: "Russian cultural heritage NER. Museums, artworks, cultural entities.",
        url: "https://github.com/russian-cultural-ner",
        entity_types: ["Artwork", "Artist", "Museum", "Period", "Style"],
        language: "ru",
        domain: "encyclopedia",
        license: "CC-BY-4.0",
        citation: "RuCultural Team (2022)",
        year: 2022,
        format: "CoNLL",
        notes: "Russian cultural heritage; fine-grained art types",
        categories: [ner, multilingual],
    },

    // 18th Century NER: NER over historical English text with OCR noise.
    // The url is a GitHub organisation page with no specific repository. Declares no
    // paper_url, size_hint, splits, tasks, or access_status.
    EighteenthCenturyNER {
        name: "18th Century NER",
        description: "Named entities in 18th century English text. Historical OCR challenges.",
        url: "https://github.com/Living-with-machines/",
        entity_types: ["PER", "LOC", "ORG", "DATE"],
        language: "en",
        domain: "historical",
        license: "CC-BY-4.0",
        citation: "Living with Machines (2020)",
        year: 2020,
        format: "CoNLL",
        notes: "OCR noise; historical spelling variation",
        categories: [ner, historical],
    },

    // Spanish Medieval TEI: NER over TEI-encoded medieval Spanish manuscripts; the
    // format is recorded as plain "XML". Declares no paper_url, size_hint, splits,
    // tasks, or access_status.
    SpanishMedievalTEI {
        name: "Spanish Medieval TEI",
        description: "Medieval Spanish manuscript NER. TEI-encoded historical texts.",
        url: "https://github.com/spanish-medieval-nlp",
        entity_types: ["PER", "LOC", "ORG", "DATE"],
        language: "es",
        domain: "historical",
        license: "CC-BY-4.0",
        citation: "Spanish Medieval NLP (2021)",
        year: 2021,
        format: "XML",
        notes: "Medieval Castilian; paleographic challenges",
        categories: [ner, historical, multilingual],
    },

    // Medieval Czech Charters: NER over medieval legal charters in XML.
    // Declares no paper_url, size_hint, splits, tasks, or access_status.
    MedievalCzechCharters {
        name: "Medieval Czech Charters",
        description: "Czech medieval charter NER. Historical legal documents.",
        url: "https://github.com/czech-medieval-charters",
        entity_types: ["PER", "LOC", "ORG", "DATE"],
        // Recorded as "cs", although `notes` says the charters also contain Latin.
        language: "cs",
        domain: "historical",
        license: "CC-BY-4.0",
        citation: "Czech Charter Team (2020)",
        year: 2020,
        format: "XML",
        notes: "Medieval Czech and Latin; charter formulae",
        categories: [ner, historical, multilingual],
    },

    // Dutch archaeological excavation reports with standoff annotations (DANS archive).
    // NOTE(review): the "(v2)" suffix in `name` presumably separates this from another
    // Dutch archaeology entry elsewhere in the registry — verify. `url` is the archive
    // landing page, not a dataset-specific link.
    DutchArchaeologyNER {
        name: "Dutch Archaeology NER (v2)",
        description: "Dutch archaeological excavation reports. DANS archive annotations.",
        url: "https://easy.dans.knaw.nl/",
        entity_types: ["Site", "Artifact", "Period", "Material"],
        language: "nl",
        domain: "archaeology",
        license: "CC-BY-4.0",
        citation: "DANS (2021)",
        year: 2021,
        format: "Standoff",
        notes: "Archaeological domain; ~31k annotations",
        categories: [ner, historical, multilingual],
    },

    // Guaraní (language code "gn") NER; low-resource indigenous-language entry.
    // NOTE(review): `url` is a GitHub account root rather than a repository — confirm.
    GuaraniNER {
        name: "Guaraní NER",
        description: "Guaraní language NER. South American indigenous language.",
        url: "https://github.com/guarani-nlp",
        entity_types: ["PER", "LOC", "ORG"],
        language: "gn",
        domain: "indigenous",
        license: "CC-BY-4.0",
        citation: "Guaraní NLP Team (2021)",
        year: 2021,
        format: "CoNLL",
        notes: "Low-resource indigenous language; Paraguay official language",
        categories: [ner, indigenous, low_resource],
    },

    // Shipibo-Konibo (language code "shp") NER; low-resource indigenous-language entry.
    // No paper_url is recorded for the cited work.
    ShipiboKoniboNER {
        name: "Shipibo-Konibo NER",
        description: "Shipibo-Konibo language NER. Peruvian Amazonian language.",
        url: "https://github.com/ixa-ehu/shipibo-konibo",
        entity_types: ["PER", "LOC", "ORG"],
        language: "shp",
        domain: "indigenous",
        license: "CC-BY-4.0",
        citation: "Mager et al. (2018)",
        year: 2018,
        format: "CoNLL",
        notes: "Endangered language; ~3k speakers",
        categories: [ner, indigenous, low_resource],
    },

    // Navajo (language code "nv") entry in CoNLL-U format.
    // NOTE(review): described as morphological annotation yet categorised as `ner`
    // with PER/LOC types — confirm the resource actually carries entity labels.
    // `url` is a GitHub account root rather than a repository.
    NavajoMorph {
        name: "Navajo Morphology",
        description: "Navajo morphological annotation. North American indigenous language.",
        url: "https://github.com/navajo-nlp",
        entity_types: ["PER", "LOC"],
        language: "nv",
        domain: "indigenous",
        license: "Research",
        citation: "Navajo NLP Team (2020)",
        year: 2020,
        format: "CoNLLU",
        notes: "Complex verb morphology; tonal language",
        categories: [ner, indigenous, low_resource],
    },

    // Korean literary character coreference over 50 novels; PER-only mentions,
    // CoNLL files with the CoNLLCoref annotation scheme. Corpus statistics and
    // baseline figures are kept in `notes`.
    KoCoNovel {
        name: "KoCoNovel",
        description: "Korean character coreference in 50 modern/contemporary novels. First Korean literary coreference dataset. Four versions: Reader/Omniscient perspective × Separate/Overlapped entity treatment. 178K tokens, 19K mentions, ~1.4K entities.",
        url: "https://github.com/storidient/KoCoNovel",
        entity_types: ["PER"],
        language: "ko",
        domain: "fiction",
        license: "CC-BY-SA-4.0",
        citation: "Kim, Lee & Lee (2024)",
        paper_url: "https://arxiv.org/abs/2404.01140",
        year: 2024,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        size_hint: "178K tokens, 50 novels, 17975 sentences",
        notes: "Mention types: Pronominal 30.7%, Proper Name 22.8%, Single Noun 24.1% (kinship 9.2%, titles 3.1%), Noun Phrase 22.4%. Korean address term culture (호칭 문화) favors kinship over names. Distance stats: Antecedent avg 70.7 tokens, Spread avg 1583.3 tokens. Korean lacks determiners and proper noun markers. Four annotation versions. Morpheme-unit spans. Speaker annotations. IAA: MUC 94.53 F1. BERT baseline: ~62-73% MUC F1.",
        categories: [coref, literary, multilingual],
    },

    // Dutch literary coreference over fiction; CoNLL files with the CoNLLCoref scheme.
    // No paper_url is recorded.
    OpenBoek {
        name: "OpenBoek",
        description: "Dutch literary coreference. Open-source Dutch fiction annotation.",
        url: "https://github.com/cltl/OpenBoek",
        entity_types: ["PER", "LOC", "ORG"],
        language: "nl",
        domain: "fiction",
        license: "CC-BY-4.0",
        citation: "OpenBoek Team (2021)",
        year: 2021,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        notes: "Dutch novels; literary coreference patterns",
        categories: [coref, literary, multilingual],
    },

    // Cross-document coreference of scientific concepts (methods, tasks, datasets)
    // in AI papers; distributed as JSONL.
    SciCo {
        name: "SciCo",
        description: "Scientific coreference. Cross-document concept coreference in AI papers.",
        url: "https://github.com/allenai/scico",
        entity_types: ["Method", "Task", "Dataset"],
        language: "en",
        domain: "scientific",
        license: "Apache-2.0",
        citation: "Cattan et al. (2021)",
        paper_url: "https://aclanthology.org/2021.emnlp-main.518/",
        year: 2021,
        format: "JSONL",
        notes: "Scientific concepts; cross-document coreference",
        categories: [coref],
    },

    // SemEval-2013 Task 9.1 drug-name recognition for drug-drug interaction
    // extraction; XML corpus drawn from MedLine and DrugBank (per `notes`).
    SemEval2013Task91 {
        name: "SemEval-2013 Task 9.1",
        description: "Drug-drug interaction extraction. SemEval shared task.",
        url: "https://www.cs.york.ac.uk/semeval-2013/task9/",
        entity_types: ["Drug", "Drug_n", "Group", "Brand"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Segura-Bedmar et al. (2013)",
        paper_url: "https://aclanthology.org/S13-2056/",
        year: 2013,
        format: "XML",
        notes: "Drug-drug interaction; MedLine and DrugBank",
        categories: [ner, relation_extraction, biomedical],
    },

    // Penn Discourse Treebank 3.0, distributed through the LDC catalogue (licensed).
    // Carries no entity types; it is filed under `coref` although it annotates
    // discourse relations.
    // NOTE(review): the "(v2)" suffix in `name` presumably separates this from another
    // PDTB entry elsewhere in the registry — verify.
    PDTB3 {
        name: "PDTB 3.0 (v2)",
        description: "Penn Discourse Treebank 3.0. Discourse relations and connectives.",
        url: "https://catalog.ldc.upenn.edu/LDC2019T05",
        entity_types: [],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Prasad et al. (2019)",
        year: 2019,
        format: "Custom",
        notes: "Discourse relations; implicit and explicit connectives",
        categories: [coref],
    },

    // Winograd-style pronoun resolution benchmark (PER-only), custom format.
    // NOTE(review): no paper_url is recorded and the citation/url could not be
    // cross-checked from this file. A 2024 dataset also named "WinoPron" (a revision
    // of the Winogender schemas) may be the intended resource — confirm provenance.
    WinoPron {
        name: "WinoPron",
        description: "Winograd pronoun resolution. Commonsense coreference benchmark.",
        url: "https://cs.nyu.edu/~davise/papers/WinoPron/",
        entity_types: ["PER"],
        language: "en",
        domain: "evaluation",
        license: "Research",
        citation: "Davis & Marcus (2021)",
        year: 2021,
        format: "Custom",
        notes: "Extended Winograd schemas; commonsense reasoning",
        categories: [coref],
    },

    // Reading-comprehension QA over Wikipedia paragraphs whose questions require
    // coreference resolution; filed under `coref`.
    QUOREF {
        name: "QUOREF",
        description: "Question answering requiring coreference. Reading comprehension.",
        url: "https://github.com/allenai/quoref",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "wikipedia",
        license: "CC-BY-4.0",
        citation: "Dasigi et al. (2019)",
        paper_url: "https://aclanthology.org/D19-1606/",
        year: 2019,
        format: "JSONL",
        notes: "QA requiring coreference resolution; Wikipedia paragraphs",
        categories: [coref],
    },

    // =========================================================================
    // Final Batch - Remaining Legacy Datasets
    // =========================================================================

    // Dutch half of the CoNLL-2002 NER shared task (BIO-tagged CoNLL columns).
    // `url` points directly at a single data file (`ned.testa`), not a landing page.
    CoNLL2002Dutch {
        name: "CoNLL-2002 Dutch",
        description: "Dutch portion of CoNLL-2002 NER shared task. Newspaper text.",
        url: "https://www.clips.uantwerpen.be/conll2002/ner/data/ned.testa",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "nl",
        domain: "news",
        license: "Research",
        citation: "Tjong Kim Sang (2002)",
        paper_url: "https://aclanthology.org/W02-2024/",
        year: 2002,
        format: "CoNLL",
        annotation_scheme: "BIO",
        notes: "Dutch newspaper NER; includes gazetteers",
        categories: [ner, multilingual],
    },

    // Spanish half of the CoNLL-2002 NER shared task (BIO-tagged CoNLL columns).
    // `url` points directly at a single data file (`esp.testa`), not a landing page.
    CoNLL2002Spanish {
        name: "CoNLL-2002 Spanish",
        description: "Spanish portion of CoNLL-2002 NER shared task. News articles.",
        url: "https://www.clips.uantwerpen.be/conll2002/ner/data/esp.testa",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "es",
        domain: "news",
        license: "Research",
        citation: "Tjong Kim Sang (2002)",
        paper_url: "https://aclanthology.org/W02-2024/",
        year: 2002,
        format: "CoNLL",
        annotation_scheme: "BIO",
        notes: "Spanish EFE news agency articles",
        categories: [ner, multilingual],
    },

    // Full BioCreative II Gene Mention corpus (gene/protein mentions), including the
    // training portion per `notes`. `format` is recorded as "IOB2".
    BC2GMFull {
        name: "BC2GM Full",
        description: "Complete BioCreative II Gene Mention corpus. Extended from BC2GM.",
        url: "https://biocreative.bioinformatics.udel.edu/resources/biocreative-ii-corpus/",
        entity_types: ["Gene", "Protein"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Smith et al. (2008)",
        year: 2008,
        format: "IOB2",
        notes: "Full corpus including training data",
        categories: [ner, biomedical],
    },

    // Finnish NER over news and Wikipedia text (CoNLL columns).
    // `url` corrected: the FiNER data is published in the `mpsilfve/finer-data`
    // repository; the previous `mpsilfern/finer` path misspelled the account name.
    FinNER {
        name: "FinNER",
        description: "Finnish named entity recognition. News and Wikipedia text.",
        url: "https://github.com/mpsilfve/finer-data",
        entity_types: ["PER", "LOC", "ORG", "DATE", "EVENT"],
        language: "fi",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Ruokolainen et al. (2020)",
        year: 2020,
        format: "CoNLL",
        notes: "Finnish morphologically rich language NER",
        categories: [ner, multilingual],
    },

    // English legal-domain NER (courts, judges, lawyers, parties, statutes, cases)
    // over US court documents per `notes`. No paper_url is recorded.
    LegalNER {
        name: "LegalNER",
        description: "Legal Named Entity Recognition. Court cases and legislation.",
        url: "https://github.com/legal-ner/legal-ner",
        entity_types: ["Court", "Judge", "Lawyer", "Party", "Statute", "Case"],
        language: "en",
        domain: "legal",
        license: "CC-BY-4.0",
        citation: "LegalNER Team (2021)",
        year: 2021,
        format: "CoNLL",
        notes: "Legal domain entities; US court documents",
        categories: [ner],
    },

    // Chinese entity and relation extraction entry (JSONL); notes mention nested
    // entities, but the categories list only ner / relation_extraction / multilingual.
    CEREC {
        name: "CEREC",
        description: "Chinese entity and relation extraction corpus. Web text and news.",
        url: "https://github.com/Stardust-hyx/CEREC",
        entity_types: ["PER", "LOC", "ORG"],
        language: "zh",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "Huang et al. (2021)",
        year: 2021,
        format: "JSONL",
        notes: "Chinese NER and RE; includes nested entities",
        categories: [ner, relation_extraction, multilingual],
    },

    // Mental-health text NER (symptoms, treatments, emotions); flagged in `notes`
    // as a sensitive domain. No paper_url is recorded.
    DELICATE {
        name: "DELICATE",
        description: "Depression, emotion, and linguistic analysis corpus. Mental health text.",
        url: "https://github.com/delicate-nlp/delicate",
        entity_types: ["Symptom", "Treatment", "Emotion"],
        language: "en",
        domain: "clinical",
        license: "Research",
        citation: "DELICATE Team (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Mental health NER; sensitive domain",
        categories: [ner, clinical],
    },

    // NER view of SciERC: six scientific entity types over AI paper abstracts.
    // Notes mention coreference as well, but `coref` is not among the categories.
    SciERCNER {
        name: "SciERC NER",
        description: "Scientific Information Extraction NER. AI paper abstracts.",
        url: "https://github.com/allenai/sciie/tree/main/data",
        entity_types: ["Task", "Method", "Metric", "Material", "OtherScientificTerm", "Generic"],
        language: "en",
        domain: "scientific",
        license: "Apache-2.0",
        citation: "Luan et al. (2018)",
        paper_url: "https://aclanthology.org/D18-1360/",
        year: 2018,
        format: "JSONL",
        notes: "6 entity types; includes nested entities and coreference",
        categories: [ner, nested_ner, relation_extraction],
    },

    // Silver-standard large-scale NER entry kept for reference only: `url` is
    // intentionally empty and `access_status` is Deprecated because no stable
    // public location was found (see `notes`).
    ULNER {
        name: "ULNER",
        description: "Ultra-Large Scale NER. Massive silver-standard dataset.",
        url: "",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "ULNER Team (2023)",
        year: 2023,
        format: "JSONL",
        notes: "No stable public URL found (prior HuggingFace URL returned 404).",
        access_status: Deprecated,
        categories: [ner],
    },

    // Multilingual ("mul") NER benchmark hosted on the Hugging Face Hub; `hf_id`
    // mirrors the dataset path in `url`.
    // NOTE(review): `notes` and `citation` describe a ChatGPT-distilled NER model,
    // while `hf_id` points at universalner/universal_ner — confirm that the citation
    // and the hosted dataset refer to the same work.
    UniversalNER {
        name: "UniversalNER",
        description: "Universal NER model benchmark. Multiple domains and languages.",
        url: "https://huggingface.co/datasets/universalner/universal_ner",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mul",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "Zhou et al. (2023)",
        paper_url: "https://arxiv.org/abs/2308.03279",
        year: 2023,
        format: "JSONL",
        notes: "ChatGPT-distilled NER model benchmark",
        tasks: ["ner"],
        hf_id: "universalner/universal_ner",
        access_status: Public,
        categories: [ner, multilingual],
    },

    // GENIA (biomedical) portion of the ARRAU coreference corpus; MMAX2 files with
    // the ARRAU annotation scheme.
    // `url` is an ACL Anthology paper page, not a data download, and no separate
    // paper_url is recorded.
    ArrauGenia {
        name: "ARRAU GENIA",
        description: "ARRAU corpus GENIA portion. Biomedical coreference.",
        url: "https://aclanthology.org/2020.codi-1.1/",
        entity_types: ["Gene", "Protein", "Cell"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Uryupina et al. (2020)",
        year: 2020,
        format: "MMAX2",
        annotation_scheme: "ARRAU",
        notes: "Biomedical portion of ARRAU corpus",
        categories: [coref, biomedical],
    },

    // Pear Stories (film-retelling narratives) portion of the ARRAU coreference
    // corpus; MMAX2 files with the ARRAU annotation scheme.
    // `url` is an ACL Anthology paper page, not a data download.
    ArrauPear {
        name: "ARRAU Pear Stories",
        description: "ARRAU Pear Stories portion. Narrative coreference.",
        url: "https://aclanthology.org/2020.codi-1.1/",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "narrative",
        license: "Research",
        citation: "Uryupina et al. (2020)",
        year: 2020,
        format: "MMAX2",
        annotation_scheme: "ARRAU",
        notes: "Film retelling narratives; discourse structure",
        categories: [coref, literary],
    },

    // RST-DT (Wall Street Journal) portion of the ARRAU coreference corpus; MMAX2
    // files with the ARRAU annotation scheme.
    // `url` is an ACL Anthology paper page, not a data download.
    ArrauRst {
        name: "ARRAU RST",
        description: "ARRAU RST-DT portion. Discourse-annotated Wall Street Journal.",
        url: "https://aclanthology.org/2020.codi-1.1/",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Uryupina et al. (2020)",
        year: 2020,
        format: "MMAX2",
        annotation_scheme: "ARRAU",
        notes: "WSJ with RST discourse structure",
        categories: [coref],
    },

    // Trains (task-oriented dialogue) portion of the ARRAU coreference corpus; MMAX2
    // files with the ARRAU annotation scheme. Uses TIME instead of ORG in its types.
    // `url` is an ACL Anthology paper page, not a data download.
    ArrauTrains {
        name: "ARRAU Trains",
        description: "ARRAU Trains portion. Task-oriented dialogue coreference.",
        url: "https://aclanthology.org/2020.codi-1.1/",
        entity_types: ["PER", "LOC", "TIME"],
        language: "en",
        domain: "dialogue",
        license: "Research",
        citation: "Uryupina et al. (2020)",
        year: 2020,
        format: "MMAX2",
        annotation_scheme: "ARRAU",
        notes: "Task-oriented dialogue; train scheduling domain",
        categories: [coref, dialogue],
    },

    // Implicit-argument annotations layered on NomBank nominal predicates;
    // LDC-licensed, no entity types, filed under `coref`.
    NomBankImplicit {
        name: "NomBank Implicit",
        description: "Implicit arguments in NomBank. Nominal predicate-argument structures.",
        url: "https://catalog.ldc.upenn.edu/LDC2008T23",
        entity_types: [],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Gerber & Chai (2012)",
        year: 2012,
        format: "Custom",
        notes: "Implicit argument recovery; extends NomBank",
        categories: [coref],
    },

    // Bengali (language code "bn") NER entry in a low-resource setting.
    // NOTE(review): citation is a placeholder-style "Team" reference with no
    // paper_url — confirm the acronym expansion and source against the linked site.
    BASHI {
        name: "BASHI",
        description: "Bangla Shared Task on Information extraction. Bengali NER.",
        url: "https://sites.google.com/view/ipm-bashi/",
        entity_types: ["PER", "LOC", "ORG"],
        language: "bn",
        domain: "news",
        license: "Research",
        citation: "BASHI Team (2020)",
        year: 2020,
        format: "CoNLL",
        notes: "Bengali (Bangla) NER; low-resource setting",
        categories: [ner, multilingual, low_resource],
    },

    // RST Signalling Corpus: discourse-signal annotations extending RST-DT.
    // No entity types; filed under `coref` although it annotates discourse signals.
    ERST {
        name: "ERST",
        description: "English RST Signalling Corpus. Discourse markers and signals.",
        url: "https://github.com/rsttools/signal",
        entity_types: [],
        language: "en",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "Das & Taboada (2018)",
        year: 2018,
        format: "Custom",
        notes: "Discourse signals; extends RST-DT",
        categories: [coref],
    },

    // Temporal-relation entry (Event / Time mentions) for event ordering and
    // duration; filed under ner + temporal. No paper_url is recorded.
    BiTimeBERT {
        name: "BiTimeBERT",
        description: "Bi-directional temporal relation dataset. Event ordering and duration.",
        url: "https://github.com/btime-bert/bitimebert",
        entity_types: ["Event", "Time"],
        language: "en",
        domain: "news",
        license: "CC-BY-4.0",
        citation: "BiTimeBERT Team (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Temporal reasoning; event-time relations",
        categories: [ner, temporal],
    },

    // Combined entity and discourse annotation entry, filed under `coref`.
    // No paper_url is recorded.
    TRIDIS {
        name: "TRIDIS",
        description: "Triple Discourse dataset. Entity and discourse relations.",
        url: "https://github.com/tridis/tridis",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "TRIDIS Team (2021)",
        year: 2021,
        format: "JSONL",
        notes: "Combined entity and discourse annotation",
        categories: [coref],
    },

    // Bias-evaluation coreference benchmark (PER-only) covering non-binary pronouns.
    // No paper_url is recorded.
    QueerBench {
        name: "QueerBench",
        description: "Queer identity coreference benchmark. LGBTQ+ representation in NLP.",
        url: "https://github.com/queerbench/queerbench",
        entity_types: ["PER"],
        language: "en",
        domain: "evaluation",
        license: "CC-BY-4.0",
        citation: "QueerBench Team (2022)",
        year: 2022,
        format: "JSONL",
        notes: "Tests coreference for non-binary pronouns; bias evaluation",
        categories: [coref, bias_evaluation],
    },

    // Stereotype-detection benchmark for language-model bias. It has no entity
    // types and is categorised only as `bias_evaluation` (neither ner nor coref).
    QUEEREOTYPES {
        name: "QUEEREOTYPES",
        description: "LGBTQ+ stereotype detection in text. Bias in language models.",
        url: "https://github.com/queereotypes/queereotypes",
        entity_types: [],
        language: "en",
        domain: "evaluation",
        license: "CC-BY-4.0",
        citation: "Felkner et al. (2023)",
        year: 2023,
        format: "JSONL",
        notes: "Stereotype detection; tests model biases",
        categories: [bias_evaluation],
    },

    // Clinical concept extraction/normalisation entry (drugs, diseases, procedures)
    // in standoff format. `license` records that a data-use agreement is required.
    MAP {
        name: "MAP",
        description: "Medical Annotation Pipeline dataset. Clinical concept normalization.",
        url: "https://github.com/medical-annotation-pipeline/map",
        entity_types: ["Drug", "Disease", "Procedure"],
        language: "en",
        domain: "clinical",
        license: "DUA Required",
        citation: "MAP Team (2021)",
        year: 2021,
        format: "Standoff",
        notes: "Clinical concept extraction and normalization",
        categories: [ner, clinical],
    },

    // ASN: the Anaphoric Shell Nouns corpus (Kolhatkar, Zinsmeister & Hirst),
    // hosted on Varada Kolhatkar's Toronto page (see `url`).
    // Corrected metadata: the earlier text described an unrelated "Atomic Slot
    // Number" slot-filling dataset and cited "Law et al. (2013)", apparently a
    // garbling of the venue (Linguistic Annotation Workshop, LAW 2013). Like the
    // other discourse/anaphora entries it has no entity types and sits under `coref`.
    ASN {
        name: "ASN",
        description: "Anaphoric Shell Nouns corpus. Shell nouns (e.g. 'this issue', 'this fact') annotated with their antecedents.",
        url: "http://www.cs.toronto.edu/~varada/ASN/",
        entity_types: [],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Kolhatkar et al. (2013)",
        year: 2013,
        format: "Custom",
        notes: "Abstract anaphora; shell noun antecedent resolution",
        categories: [coref],
    },

    // CodeSearchNet: source-code corpus spanning six programming languages
    // (`language` is "mul"), registered here for code-entity extraction under `ner`.
    CSN {
        name: "CSN",
        description: "Code Search Net. Programming language dataset for code understanding.",
        url: "https://github.com/github/CodeSearchNet",
        entity_types: ["Function", "Class", "Variable"],
        language: "mul",
        domain: "code",
        license: "MIT",
        citation: "Husain et al. (2019)",
        paper_url: "https://arxiv.org/abs/1909.09436",
        year: 2019,
        format: "JSONL",
        notes: "Code entity and function extraction; 6 languages",
        categories: [ner],
    },

    // Mexican-Spanish homonym / word-sense disambiguation entry. It has no entity
    // types and is categorised only as `multilingual`. No paper_url is recorded.
    HOMOMEX {
        name: "HOMOMEX",
        description: "Homonym resolution in Mexican Spanish. Word sense disambiguation.",
        url: "https://github.com/homomex/homomex",
        entity_types: [],
        language: "es",
        domain: "general",
        license: "CC-BY-4.0",
        citation: "HOMOMEX Team (2021)",
        year: 2021,
        format: "JSONL",
        notes: "Mexican Spanish; tests regional variation",
        categories: [multilingual],
    },

    // E-commerce NER (products, brands, attributes, prices) over product-catalogue
    // text. No paper_url is recorded.
    ENER {
        name: "ENER",
        description: "E-commerce NER. Product entities in e-commerce text.",
        url: "https://github.com/ener-dataset/ener",
        entity_types: ["Product", "Brand", "Attribute", "Price"],
        language: "en",
        domain: "e-commerce",
        license: "CC-BY-4.0",
        citation: "ENER Team (2022)",
        year: 2022,
        format: "CoNLL",
        notes: "E-commerce domain; product catalogs",
        categories: [ner],
    },

    // =========================================================================
    // Niche Domains: Gaming & Fantasy
    // =========================================================================

    // FIREBALL: D&D actual-play sessions paired with structured game state, hosted
    // on the Hugging Face Hub (see `url`).
    // Citation and year corrected: FIREBALL was published by Zhu et al. at ACL 2023.
    // "Rameshkumar & Bailey (2020)" is the Critical Role paper, which has its own
    // entry (CriticalRoleDataset) in this section.
    FIREBALL {
        name: "FIREBALL",
        description: "D&D gameplay NLG with true game state. ~25k sessions, 153k turns with structured game state.",
        url: "https://huggingface.co/datasets/lara-martin/FIREBALL",
        entity_types: ["Character", "Item", "Location", "Creature", "Spell", "Action"],
        language: "en",
        domain: "gaming",
        license: "CC-BY-4.0",
        citation: "Zhu et al. (2023)",
        paper_url: "https://par.nsf.gov/biblio/10463286",
        year: 2023,
        format: "JSONL",
        size_hint: "~25k sessions, 153k turns",
        notes: "D&D actual play with structured game state; tests NLG in narrative gaming",
        categories: [ner, dialogue],
    },

    // Fantasy-domain NER drawn from seven D&D adventure books.
    // `url` is the PDF of the same ACL Anthology paper given in `paper_url`; no
    // separate data download location is recorded.
    DnDNERBenchmark {
        name: "D&D NER Benchmark",
        description: "Fantasy NER from 7 D&D adventure books. LLM-annotated fantasy entities.",
        url: "https://aclanthology.org/2023.ranlp-1.130.pdf",
        entity_types: ["Character", "Location", "Item", "Creature", "Spell", "Organization"],
        language: "en",
        domain: "gaming",
        license: "Research",
        citation: "Veselovsky et al. (2023)",
        paper_url: "https://aclanthology.org/2023.ranlp-1.130/",
        year: 2023,
        format: "CoNLL",
        notes: "Fantasy domain; Flair/Trankit/SpaCy benchmarks; tests fictional entity recognition",
        categories: [ner, literary],
    },

    // Transcripts of unscripted live D&D play (Critical Role) for storytelling and
    // dialogue analysis.
    // `url` is a PDF copy of the paper rather than a data download; `paper_url`
    // points at the ACL Anthology record.
    CriticalRoleDataset {
        name: "Critical Role Dataset",
        description: "Unscripted live D&D transcripts. Storytelling and dialogue analysis.",
        url: "https://www.microsoft.com/en-us/research/wp-content/uploads/2020/06/R.Rameshkumar-and-P.Bailey-Storytelling-with-Dialogue-ACL2020.pdf",
        entity_types: ["Character", "Location", "Item"],
        language: "en",
        domain: "gaming",
        license: "Research",
        citation: "Rameshkumar & Bailey (2020)",
        paper_url: "https://aclanthology.org/2020.acl-main.459/",
        year: 2020,
        format: "Custom",
        notes: "Live improvised gameplay transcripts; narrative coherence and character tracking",
        categories: [ner, dialogue, literary],
    },

    // =========================================================================
    // Niche Domains: Legal & Contracts
    // =========================================================================

    // Contract Understanding Atticus Dataset: clause-level annotations over 510
    // commercial contracts (41 clause types per `size_hint`), registered under `ner`.
    CUAD {
        name: "CUAD",
        description: "Contract Understanding Atticus Dataset. 13k+ labels across 510 commercial contracts.",
        url: "https://www.atticusprojectai.org/cuad",
        entity_types: ["Party", "Date", "Amount", "Clause", "Jurisdiction"],
        language: "en",
        domain: "legal",
        license: "CC-BY-4.0",
        citation: "Hendrycks et al. (2021)",
        paper_url: "https://arxiv.org/abs/2103.06268",
        year: 2021,
        format: "JSONL",
        size_hint: "510 contracts, 13k+ annotations, 41 clause types",
        notes: "Contract clause extraction; covers indemnification, IP, termination clauses",
        categories: [ner],
    },

    // Clause-retrieval benchmark for contract drafting (query–clause pairs with
    // 1–5 star relevance ratings), registered under `ner`.
    // `url` is the arXiv HTML rendering of the paper in `paper_url`, not a data
    // download.
    ACORD {
        name: "ACORD",
        description: "Expert-annotated clause retrieval for contract drafting. 114 queries, 126k+ pairs.",
        url: "https://arxiv.org/html/2501.06582v1",
        entity_types: ["Clause", "Party", "Obligation", "Condition"],
        language: "en",
        domain: "legal",
        license: "Research",
        citation: "ACORD Team (2025)",
        paper_url: "https://arxiv.org/abs/2501.06582",
        year: 2025,
        format: "JSONL",
        size_hint: "114 queries, 126k+ query-clause pairs with 1-5 star rankings",
        notes: "Clause retrieval; Limitation of Liability, Indemnification, MFN clauses",
        categories: [ner],
    },

    // Contract party identification (parties, roles, organisations) in standoff
    // format.
    // `url` is the PDF of the same ACL Anthology paper given in `paper_url`; no
    // separate data download location is recorded.
    PartyExtractionDataset {
        name: "Party Extraction Dataset",
        description: "Legal party identification from contracts. Contextual span representations.",
        url: "https://aclanthology.org/2023.ranlp-1.116.pdf",
        entity_types: ["Party", "Role", "Organization"],
        language: "en",
        domain: "legal",
        license: "Research",
        citation: "Tuggener et al. (2023)",
        paper_url: "https://aclanthology.org/2023.ranlp-1.116/",
        year: 2023,
        format: "Standoff",
        notes: "Legal party NER; disambiguates parties in complex contract structures",
        categories: [ner],
    },

    // =========================================================================
    // Niche Domains: Food & Recipes
    // =========================================================================
    // NOTE: TASTEset exists earlier in file (Niche Domain Datasets section)

    // Food-ingredient NER over ~182k ingredient phrases (figshare download).
    // NOTE(review): `description` says the data is in IOB2 format while `format` is
    // "BIO"; other entries use "IOB2" as a format and "BIO" as an annotation_scheme —
    // confirm which value the loader expects here.
    FINERFood {
        name: "FINER (Food)",
        description: "Food ingredient NER. 181k ingredient phrases in IOB2 format.",
        url: "https://figshare.com/articles/dataset/Food_Ingredient_Named-Entity_Data/20222361",
        entity_types: ["Ingredient", "Product", "Quantity", "Unit", "State"],
        language: "en",
        domain: "food",
        license: "CC-BY-4.0",
        citation: "Popovski et al. (2022)",
        year: 2022,
        format: "BIO",
        size_hint: "181,970 ingredient phrases",
        notes: "Semi-supervised multi-model prediction for ingredient parsing",
        categories: [ner, arcane_domain],
    },

    // Japanese recipe corpus tracking ingredient state across cooking steps.
    // `url` is the arXiv HTML rendering of the paper in `paper_url`, not a data
    // download.
    NHKRecipeDataset {
        name: "NHK Recipe Dataset",
        description: "Japanese recipes with ingredient state tracking across cooking steps.",
        url: "https://arxiv.org/html/2507.17232v1",
        entity_types: ["Ingredient", "Action", "State", "Tool"],
        language: "ja",
        domain: "food",
        license: "Research",
        citation: "NHK Team (2025)",
        paper_url: "https://arxiv.org/abs/2507.17232",
        year: 2025,
        format: "JSONL",
        notes: "State transitions per ingredient; procedural understanding in Japanese",
        categories: [ner, multilingual, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Ancient Languages & Scripts
    // =========================================================================

    // Sanskrit (language code "sa") NER over the Bhagavad Gita and the Yoga Sutras,
    // hosted on Kaggle; adds a CONCEPT type to PER/LOC/ORG. No paper_url is recorded.
    SanskritNERBhagavadGita {
        name: "Sanskrit NER (Bhagavad Gita)",
        description: "Sanskrit NER from Bhagavad Gita and Patanjali Yoga Sutras.",
        url: "https://www.kaggle.com/datasets/akashsuklabaidya/ner-dataset-fyp-25",
        entity_types: ["PER", "LOC", "ORG", "CONCEPT"],
        language: "sa",
        domain: "religious",
        license: "Research",
        citation: "Suklabaidya (2025)",
        year: 2025,
        format: "CoNLLU",
        notes: "Classical Sanskrit texts; tests Indic script and religious terminology",
        categories: [ner, ancient, arcane_domain],
    },

    // Sanskrit entity discovery and linking over the Mahābhārata; CoNLL-U files
    // with standoff annotations, declaring train/dev/test splits and ner/coref/el
    // tasks. `tasks` includes "ner", but `categories` does not list `ner`.
    Mahanama {
        name: "Mahānāma",
        description: "Sanskrit Entity Discovery and Linking from Mahābhārata. World's largest epic with extreme name variation.",
        url: "https://github.com/sujoysarkarai/mahanama",
        entity_types: ["Person", "Location", "Miscellaneous"],
        language: "sa",
        domain: "literary",
        license: "CC-BY-4.0",
        citation: "Sarkar et al. (2025)",
        paper_url: "https://arxiv.org/abs/2509.19844",
        year: 2025,
        format: "CoNLLU",
        annotation_scheme: "Standoff",
        size_hint: "988K tokens, 73K verses, 109K mentions, 5.5K entities",
        notes: "First large-scale Sanskrit literary EDL. Character-level boundaries for sandhi MWTs (39% of mentions). Cross-lingual KB in English. SLP1 encoding. Extreme challenges: 124.42 avg name forms per major entity (max 1385 for Śiva), 47% entity ambiguity. Best baseline: 51.57% coref F1, 64.19% EL F1.",
        splits: ["train", "dev", "test"],
        tasks: ["ner", "coref", "el"],
        categories: [coref, literary, ancient, long_document, arcane_domain, low_resource],
    },

    // Akkadian (language code "akk") cuneiform corpus with transliteration.
    // `url` and `paper_url` are the same article link; no separate data download
    // location is recorded.
    AkkadianCuneiformDataset {
        name: "Akkadian Cuneiform Dataset",
        description: "Unicode cuneiform with transliteration. Old/Middle Babylonian, Neo-Assyrian.",
        url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC7592802/",
        entity_types: ["Person", "Place", "God", "Object"],
        language: "akk",
        domain: "historical",
        license: "CC-BY-4.0",
        citation: "Gordin et al. (2020)",
        paper_url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC7592802/",
        year: 2020,
        format: "Custom",
        notes: "Cuneiform glyphs with segmentation; covers ~2000 years of Mesopotamian text",
        categories: [ner, ancient, historical],
    },

    // Cuneiform sign-classification benchmark. Its "entity types" are sign classes
    // (Sign / Determinative / Logogram) rather than named entities, though it is
    // filed under `ner`.
    // `url` and `paper_url` are the same article link.
    HeidelbergCuneiformBenchmark {
        name: "Heidelberg Cuneiform Benchmark",
        description: "Cuneiform sign classification across historical periods.",
        url: "https://direct.mit.edu/coli/article/49/3/703/116160",
        entity_types: ["Sign", "Determinative", "Logogram"],
        language: "akk",
        domain: "historical",
        license: "Research",
        citation: "Heidelberg Team (2023)",
        paper_url: "https://direct.mit.edu/coli/article/49/3/703/116160",
        year: 2023,
        format: "Custom",
        notes: "Sign-level classification; tests paleographic variation across periods",
        categories: [ner, ancient, historical],
    },

    // =========================================================================
    // Niche Domains: Mythology & Cultural Heritage
    // =========================================================================

    // Knowledge-graph resource built from mythological texts with a coreference +
    // relation-extraction pipeline; filed under ner and coref.
    // `url` is a PDF of the paper; `paper_url` is the journal landing page.
    GreekMythologyKG {
        name: "Greek Mythology Knowledge Graph",
        description: "Coref + RE pipeline for mythological texts. 15k+ entities from Roscher's Lexikon.",
        url: "https://www.semantic-web-journal.net/system/files/swj2754.pdf",
        entity_types: ["Deity", "Hero", "Place", "Creature", "Object", "Event"],
        language: "en",
        domain: "mythology",
        license: "CC-BY-4.0",
        citation: "Myth KG Team (2019)",
        paper_url: "https://www.semantic-web-journal.net/content/greek-mythology-knowledge-graph",
        year: 2019,
        format: "Custom",
        notes: "RDF conversion of mythological texts; handles divine genealogies and epithets",
        categories: [ner, coref, arcane_domain],
    },

    // Cross-cultural folklore motif catalogue (548 motifs, 309 traditions);
    // multilingual ("mul"), custom format. `url` is an academia.edu page and no
    // paper_url is recorded.
    FolkloreMotifDistribution {
        name: "Folklore Motif Distribution",
        description: "548 folklore motifs across 309 ethnic traditions in the Old World.",
        url: "https://www.academia.edu/14481230/",
        entity_types: ["Motif", "Tradition", "Region", "Character"],
        language: "mul",
        domain: "mythology",
        license: "Research",
        citation: "Berezkin et al. (2015)",
        year: 2015,
        format: "Custom",
        notes: "Cross-cultural motif tracking; tests cultural entity alignment",
        categories: [ner, multilingual, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Military, Defense & Cyber Threat Intelligence
    // =========================================================================

    // ND-NER: English defense/OSINT NER entry in CoNLL format, tagged as nested NER.
    // The description advertises "17+" entity types while `entity_types` lists nine
    // labels. NOTE(review): confirm whether the list is meant to be exhaustive.
    NDNER {
        name: "ND-NER",
        description: "National defense OSINT NER. 17+ entity types for military equipment.",
        url: "https://github.com/XinyanLi2016/ND-NER",
        entity_types: ["AIRCRAFT", "SHIP", "MISSILE", "TANK", "FIREARM", "ELECTRONIC", "MASS_DESTR", "SPACE", "NEW"],
        language: "en",
        domain: "defense",
        license: "CC-BY-SA-4.0",
        citation: "Li et al. (2022)",
        year: 2022,
        format: "CoNLL",
        notes: "Nested and flat versions; covers WMDs, directed energy, kinetic weapons",
        categories: [ner, nested_ner, arcane_domain],
    },

    // re3d: English defense-domain entity + relation extraction entry in BRAT format,
    // hosted in the DSTL GitHub repository given in `url`. No `paper_url` is recorded.
    Re3dDefense {
        name: "re3d (Defense)",
        description: "Relationship and Entity Extraction Evaluation Dataset for defense domain.",
        url: "https://github.com/dstl/re3d",
        entity_types: ["Person", "Organization", "Location", "Equipment", "Event"],
        language: "en",
        domain: "defense",
        license: "OGL",
        citation: "DSTL (2016)",
        year: 2016,
        format: "BRAT",
        notes: "UK Defence Science; relationship extraction for intelligence analysis",
        categories: [ner, relation_extraction, arcane_domain],
    },

    // CyNER-APTNER: English cyber-threat-intelligence NER entry in CoNLL format.
    // Its `domain` is "cybersecurity" even though it sits under the Military & Defense
    // section header. `url` and `paper_url` are the same PDF link.
    // NOTE(review): "CyNER Team" looks like a placeholder credit — confirm real authors.
    CyNERAptner {
        name: "CyNER-APTNER",
        description: "Unified cyber threat intelligence NER. Malware, threat actors, IOCs.",
        url: "https://ceur-ws.org/Vol-3928/paper_170.pdf",
        entity_types: ["Malware", "ThreatActor", "Vulnerability", "Indicator", "Tool"],
        language: "en",
        domain: "cybersecurity",
        license: "Research",
        citation: "CyNER Team (2024)",
        paper_url: "https://ceur-ws.org/Vol-3928/paper_170.pdf",
        year: 2024,
        format: "CoNLL",
        notes: "Merged cyber threat datasets; security bulletin extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Geology & Earth Sciences
    // =========================================================================

    // Chinese Engineering Geology NER: Chinese (`zh`) geological-disaster NER entry in
    // BIO format. `url` is the publisher abstract page and `paper_url` is a DOI link.
    // NOTE(review): tagged `multilingual` although `language` is the single code "zh";
    // presumably the tag marks non-English data — confirm the category's meaning.
    ChineseEngineeringGeologyNER {
        name: "Chinese Engineering Geology NER",
        description: "Geological disasters NER with EDA-based augmentation for small samples.",
        url: "https://www.sciencedirect.com/science/article/abs/pii/S0957417423024272",
        entity_types: ["Disaster", "Location", "Cause", "Measure", "Material"],
        language: "zh",
        domain: "geology",
        license: "Research",
        citation: "Geology NER Team (2023)",
        paper_url: "https://doi.org/10.1016/j.eswa.2023.122427",
        year: 2023,
        format: "BIO",
        notes: "Engineering geology reports; data augmentation for low-resource domain",
        categories: [ner, multilingual, arcane_domain],
    },

    // LLM-RocMin-NER: English rocks-and-minerals NER entry in JSONL format, tagged as
    // nested NER. `url` is the publisher abstract page and `paper_url` is a DOI link.
    // NOTE(review): "RocMin Team" looks like a placeholder credit — confirm real authors.
    LLMRocMinNER {
        name: "LLM-RocMin-NER",
        description: "Rocks and minerals NER. 2-shot prompt-based extraction with nested handling.",
        url: "https://www.sciencedirect.com/science/article/abs/pii/S0098300425000949",
        entity_types: ["Rock", "Mineral", "Element", "Property", "Location"],
        language: "en",
        domain: "geology",
        license: "CC-BY-4.0",
        citation: "RocMin Team (2025)",
        paper_url: "https://doi.org/10.1016/j.cageo.2025.105949",
        year: 2025,
        format: "JSONL",
        notes: "Few-shot geoscience NER; handles nested mineral compositions",
        categories: [ner, nested_ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Materials Science
    // =========================================================================

    // PolyIE: English polymer-literature NER + relation extraction entry in JSONL format.
    // `url` is the project page; `paper_url` is the ACL Anthology record.
    // NOTE(review): verify the first author in `citation` against the linked Anthology
    // page before relying on it.
    PolyIE {
        name: "PolyIE",
        description: "Polymer materials NER + relation extraction from literature.",
        url: "https://ramprasad.mse.gatech.edu/PolyIE/",
        entity_types: ["Polymer", "Property", "Value", "Condition", "Method"],
        language: "en",
        domain: "materials",
        license: "CC-BY-4.0",
        citation: "Shetty et al. (2024)",
        paper_url: "https://aclanthology.org/2024.naacl-long.131/",
        year: 2024,
        format: "JSONL",
        notes: "Polymer science literature; property-structure relationships",
        categories: [ner, relation_extraction, arcane_domain],
    },
    // NOTE: EnzChemRED exists earlier in file (Additional Biomedical section)

    // =========================================================================
    // Niche Domains: Education & Tutoring Dialogues
    // =========================================================================

    // MathDial: English math-tutoring dialogue entry in JSONL format (3,000 dialogues per
    // `size_hint`). `url` and `paper_url` are the same arXiv abstract page, so no data
    // download link is recorded here.
    MathDial {
        name: "MathDial",
        description: "Teacher-student tutoring dialogues on multi-step math problems.",
        url: "https://arxiv.org/abs/2305.14536",
        entity_types: ["Student", "Teacher", "Problem", "Step", "Hint"],
        language: "en",
        domain: "education",
        license: "CC-BY-4.0",
        citation: "Macina et al. (2023)",
        paper_url: "https://arxiv.org/abs/2305.14536",
        year: 2023,
        format: "JSONL",
        size_hint: "3,000 tutoring dialogues",
        notes: "Scaffolding questions taxonomy; tests pedagogical dialogue understanding",
        categories: [ner, dialogue, arcane_domain],
    },

    // CoMTA: English student/AI-tutor dialogue entry in JSONL format (188 dialogues per
    // `size_hint`). No `paper_url` is set.
    // NOTE(review): `url` looks like a personal/lab homepage rather than a dataset
    // location — confirm where the data actually lives.
    CoMTA {
        name: "CoMTA",
        description: "Student-GPT4 Khanmigo tutor dialogues for knowledge tracing.",
        url: "https://learninganalytics.upenn.edu/ryanbaker/",
        entity_types: ["Student", "Tutor", "Concept", "Question", "Response"],
        language: "en",
        domain: "education",
        license: "Research",
        citation: "Baker et al. (2025)",
        year: 2025,
        format: "JSONL",
        size_hint: "188 dialogues",
        notes: "LLM tutoring evaluation; knowledge tracing in AI tutors",
        categories: [ner, dialogue, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Literary & Commonsense Coreference (French fiction, Winograd)
    // =========================================================================

    // French Full-Length Fiction Coreference: French (`fr`) long-document literary coref
    // entry in CoNLL format with the "CoNLLCoref" annotation scheme. `url` is the arXiv
    // HTML rendering (v1) of the paper given in `paper_url`.
    // NOTE(review): "French Fiction Team" looks like a placeholder credit.
    FrenchFullLengthFictionCoref {
        name: "French Full-Length Fiction Coreference",
        description: "Complete French novels spanning three centuries with character coreference.",
        url: "https://arxiv.org/html/2510.15594v1",
        entity_types: ["Character", "Location", "Organization"],
        language: "fr",
        domain: "fiction",
        license: "CC-BY-4.0",
        citation: "French Fiction Team (2025)",
        paper_url: "https://arxiv.org/abs/2510.15594",
        year: 2025,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        notes: "Full novels with gender inference; tests long-document literary coref",
        categories: [coref, literary, multilingual, long_document],
    },

    // Winograd Schema Challenge: English pronoun-resolution entry distributed as XML
    // (273 sentence pairs per `size_hint`). It is an English benchmark even though it
    // sits under a section header about French literary coreference.
    // NOTE(review): `paper_url` is an N15-* Anthology id, which by its prefix is a 2015
    // record, while `citation`/`year` say 2012 — confirm the link matches the citation.
    // NOTE(review): tagged `bias_evaluation` while the notes describe commonsense
    // reasoning — confirm the category is intended.
    WinogradSchemaChallengeWSC {
        name: "Winograd Schema Challenge",
        description: "Pronoun resolution requiring world knowledge. 273 sentence pairs.",
        url: "https://cs.nyu.edu/~davise/papers/WinoPron/WSCollection.xml",
        entity_types: ["PER"],
        language: "en",
        domain: "evaluation",
        license: "Research",
        citation: "Levesque et al. (2012)",
        paper_url: "https://aclanthology.org/N15-1117/",
        year: 2012,
        format: "XML",
        size_hint: "273 sentence pairs",
        notes: "Commonsense reasoning benchmark; tests world knowledge in coreference",
        categories: [coref, bias_evaluation],
    },

    // =========================================================================
    // Niche Domains: Dialogue Coreference (Multiparty & Visual)
    // =========================================================================

    // TV Show Multilingual Coreference: multi-language (`mul`) dialogue coref entry in
    // CoNLL format with the "CoNLLCoref" annotation scheme. Both links point at the same
    // TACL article (`url` carries an extra trailing id segment).
    TVShowMultilingualCoref {
        name: "TV Show Multilingual Coreference",
        description: "English TV show transcripts with projections to Chinese and Farsi.",
        url: "https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00581/117162",
        entity_types: ["Character", "Location", "Object"],
        language: "mul",
        domain: "dialogue",
        license: "Research",
        citation: "Khosla et al. (2023)",
        paper_url: "https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00581",
        year: 2023,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        notes: "Cross-lingual projection via subtitles; multiparty TV dialogue",
        categories: [coref, multilingual, dialogue],
    },

    // VisDial Coreference: English visual-dialog coref entry in JSONL format.
    // `url` (a ScienceDirect article) and `paper_url` (an arXiv abstract) point at two
    // different documents. NOTE(review): confirm which one `citation` refers to and
    // whether a data download link should replace `url`.
    VisDialCoref {
        name: "VisDial Coreference",
        description: "Visual dialog with 120k images and 10-turn dialogs requiring visual coref.",
        url: "https://www.sciencedirect.com/science/article/pii/S266729522300082X",
        entity_types: ["Object", "Person", "Location"],
        language: "en",
        domain: "vision",
        license: "CC-BY-4.0",
        citation: "Das et al. (2017)",
        paper_url: "https://arxiv.org/abs/1611.08669",
        year: 2017,
        format: "JSONL",
        size_hint: "120k images, 10-turn dialogs",
        notes: "Visual coreference; grounding referents in images",
        categories: [coref, dialogue],
    },

    // =========================================================================
    // Niche Domains: Procedural/Cooking Coreference
    // =========================================================================

    // RISeC: English procedural cooking-text coref entry in standoff format.
    // `url`/`paper_url` are the HTML and abstract views of one arXiv paper, the same
    // paper the EFGC entry links to. NOTE(review): "RISeC Team" looks like a
    // placeholder credit — confirm the dataset's own authors.
    RISeC {
        name: "RISeC",
        description: "Procedural cooking text with temporal relations and manner descriptions.",
        url: "https://arxiv.org/html/2411.18157v1",
        entity_types: ["Ingredient", "Tool", "Action", "State", "Time"],
        language: "en",
        domain: "food",
        license: "CC-BY-4.0",
        citation: "RISeC Team (2024)",
        paper_url: "https://arxiv.org/abs/2411.18157",
        year: 2024,
        format: "Standoff",
        notes: "Procedural coreference; tracks ingredient state through cooking steps",
        categories: [coref, arcane_domain],
    },

    // EFGC: English cooking coref entry in CoNLL format.
    // `url`/`paper_url` are the HTML and abstract views of one arXiv paper, the same
    // paper the RISeC entry links to. NOTE(review): "EFGC Team" looks like a
    // placeholder credit — confirm the dataset's own authors.
    EFGC {
        name: "EFGC",
        description: "Cooking coreference segmented by tools, foods, and actions.",
        url: "https://arxiv.org/html/2411.18157v1",
        entity_types: ["Food", "Tool", "Action"],
        language: "en",
        domain: "food",
        license: "CC-BY-4.0",
        citation: "EFGC Team (2024)",
        paper_url: "https://arxiv.org/abs/2411.18157",
        year: 2024,
        format: "CoNLL",
        notes: "Entity flow graphs for cooking; tracks transformations",
        categories: [coref, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Podcasts & Speech
    // =========================================================================

    // SPoRC: English podcast-transcript entry in JSONL format (1.1M episodes per
    // `size_hint`), tagged ner/speech/dialogue. `url` is an arXiv HTML preprint with a
    // 2411.* id, matching `year: 2024`; `paper_url` is an ACL 2025 Anthology record.
    // NOTE(review): "SPoRC Team" looks like a placeholder credit.
    SPoRC {
        name: "SPoRC",
        description: "Structured Podcast Research Corpus. 1.1M episodes with host/guest extraction.",
        url: "https://arxiv.org/html/2411.07892v1",
        entity_types: ["Host", "Guest", "Organization", "Topic"],
        language: "en",
        domain: "speech",
        license: "Research",
        citation: "SPoRC Team (2024)",
        paper_url: "https://aclanthology.org/2025.acl-long.1222/",
        year: 2024,
        format: "JSONL",
        size_hint: "1.1M podcast episodes",
        notes: "Speaker diarization; host/guest inference from transcripts",
        categories: [ner, speech, dialogue],
    },

    // =========================================================================
    // Niche Domains: Literary Relations in Fiction
    // =========================================================================

    // ARF: English literary relation-extraction entry in JSONL format; the description
    // states the annotations are GPT-4o generated (synthetic). `url` is the paper PDF and
    // `paper_url` its Anthology landing page.
    // NOTE(review): "ARF Team" looks like a placeholder credit.
    ARFFiction {
        name: "ARF (Artificial Relationships in Fiction)",
        description: "Synthetic RE dataset for literary texts. GPT-4o generated annotations.",
        url: "https://aclanthology.org/2025.latechclfl-1.13.pdf",
        entity_types: ["Character", "Location", "Object", "Event"],
        language: "en",
        domain: "fiction",
        license: "CC-BY-4.0",
        citation: "ARF Team (2025)",
        paper_url: "https://aclanthology.org/2025.latechclfl-1.13/",
        year: 2025,
        format: "JSONL",
        notes: "Literary relationship extraction; synthetic from public domain fiction",
        categories: [relation_extraction, literary],
    },

    // =========================================================================
    // Niche Domains: Biomedical Coreference (Long-Range)
    // =========================================================================

    // CRAFT Corpus (Full Coref): English biomedical long-document coreference entry in
    // standoff format, hosted in the UCDenver-ccp GitHub repository given in `url`.
    // `paper_url` points at the 2017 BMC Bioinformatics article that `citation` and
    // `year` refer to; the previous value was an unrelated arXiv 2510.* (2025) preprint.
    CRAFTCorpusCoref {
        name: "CRAFT Corpus (Full Coref)",
        description: "Biomedical coref with ~30k relations. 23% span 500-12k words.",
        url: "https://github.com/UCDenver-ccp/CRAFT",
        entity_types: ["Gene", "Protein", "Cell", "Organism", "Chemical"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-4.0",
        citation: "Cohen et al. (2017)",
        paper_url: "https://doi.org/10.1186/s12859-017-1775-9",
        year: 2017,
        format: "Standoff",
        size_hint: "97 full-text PubMed articles, ~30k coref relations",
        notes: "Long-range dependencies; identity and appositive links; tests long-document coref",
        categories: [coref, biomedical, long_document],
    },

    // =========================================================================
    // Niche Domains: Aerospace
    // =========================================================================

    // Aerospace NER Dataset: English aerospace NER entry in CoNLL format with five entity
    // types. `url` and `paper_url` are the same AIAA DOI link, so no separate data
    // download is recorded. NOTE(review): `citation` credits the publisher ("AIAA")
    // rather than authors — confirm.
    AerospaceNERDataset {
        name: "Aerospace NER Dataset",
        description: "First open-source aerospace NER. 5 entity types for aviation knowledge graphs.",
        url: "https://arc.aiaa.org/doi/10.2514/1.I011251",
        entity_types: ["Aircraft", "Component", "Manufacturer", "Mission", "System"],
        language: "en",
        domain: "aerospace",
        license: "Research",
        citation: "AIAA (2023)",
        paper_url: "https://arc.aiaa.org/doi/10.2514/1.I011251",
        year: 2023,
        format: "CoNLL",
        notes: "Aviation product knowledge graphs; technical aerospace terminology",
        categories: [ner, arcane_domain],
    },

    // Aviation Products NER: Chinese (`zh`) aviation-manufacturing NER entry in BIO
    // format. `url` is a Cranfield DSpace bitstream link; no `paper_url` is set.
    // NOTE(review): `citation` names an institution ("Cranfield") rather than authors.
    AviationProductsNER {
        name: "Aviation Products NER",
        description: "Chinese aviation manufacturing corpus. Complex product entities.",
        url: "https://dspace.lib.cranfield.ac.uk/server/api/core/bitstreams/a59ed640-4783-4ddb-871b-6fd8bd0e7400/content",
        entity_types: ["Product", "Component", "Process", "Material"],
        language: "zh",
        domain: "aerospace",
        license: "Research",
        citation: "Cranfield (2022)",
        year: 2022,
        format: "BIO",
        notes: "Aviation manufacturing technical documents in Chinese",
        categories: [ner, multilingual, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Sports
    // =========================================================================

    // VREN: English volleyball rally-description NER entry in JSONL format.
    // `url` is the arXiv HTML rendering (v1) of the paper given in `paper_url`.
    // NOTE(review): "VREN Team" looks like a placeholder credit.
    VREN {
        name: "VREN (Volleyball)",
        description: "Volleyball rally descriptions for tactical statistics extraction.",
        url: "https://arxiv.org/html/2406.12252v1",
        entity_types: ["Player", "Action", "Position", "Team", "Score"],
        language: "en",
        domain: "sports",
        license: "CC-BY-4.0",
        citation: "VREN Team (2024)",
        paper_url: "https://arxiv.org/abs/2406.12252",
        year: 2024,
        format: "JSONL",
        notes: "Sports NLG; tactical action recognition from natural language",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Fashion & Retail
    // =========================================================================

    // Fashion IQ: English fashion attribute entry in JSONL format (77k images, 1000
    // attribute labels per `size_hint`). `url` is the GitHub repository; `paper_url` is
    // an author-hosted PDF of the CVPR 2021 paper.
    FashionIQ {
        name: "Fashion IQ",
        description: "77k fashion images with relative captions. 1000 attribute labels.",
        url: "https://github.com/XiaoxiaoGuo/fashion-iq",
        entity_types: ["Texture", "Fabric", "Shape", "Part", "Style", "Color"],
        language: "en",
        domain: "fashion",
        license: "Research",
        citation: "Wu et al. (2021)",
        paper_url: "https://users.cs.utah.edu/~ziad/papers/cvpr_2021_fashion_iq.pdf",
        year: 2021,
        format: "JSONL",
        size_hint: "77k images, 1000 attribute labels",
        notes: "Dialog-based fashion retrieval; fine-grained attribute extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Biomedical Relation Extraction
    // =========================================================================

    // Natural Products RE: English biomedical relation-extraction entry in JSONL format.
    // `url` and `paper_url` are the same journal article link, so no separate data
    // download is recorded here.
    NaturalProductsRE {
        name: "Natural Products RE",
        description: "Relation extraction in underexplored biomedical domains. Diversity-sampled entities.",
        url: "https://direct.mit.edu/coli/article/50/3/953/121178",
        entity_types: ["NaturalProduct", "Organism", "Activity", "Target"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "Hettiarachchi et al. (2024)",
        paper_url: "https://direct.mit.edu/coli/article/50/3/953/121178",
        year: 2024,
        format: "JSONL",
        notes: "LOTUS-derived NP dataset; synthetic data generation achieved F1=59.0",
        categories: [relation_extraction, biomedical],
    },

    // DrugProt: English chemical-protein relation-extraction entry in BRAT format from
    // the BioCreative VII shared task. `url` is the task's track page; `paper_url` is
    // the overview article. `citation` credits the challenge rather than authors.
    DrugProtBioCreative {
        name: "DrugProt",
        description: "Chemical-protein interactions from BioCreative VII challenge.",
        url: "https://biocreative.bioinformatics.udel.edu/tasks/biocreative-vii/track-1/",
        entity_types: ["Chemical", "Gene", "Protein"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "BioCreative VII (2021)",
        paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/baac098/6833204",
        year: 2021,
        format: "BRAT",
        notes: "Drug-protein interaction classification; BioCreative shared task",
        categories: [relation_extraction, biomedical],
    },

    // =========================================================================
    // Niche Domains: Materials Science (Joint NER+RE)
    // =========================================================================

    // MOF Dataset: English metal-organic-framework joint NER + relation extraction entry
    // in JSONL format. `url` and `paper_url` are the same PMC article, which is also the
    // source article for the SolidStateDoping entry. `citation` names that article's
    // authors instead of the earlier "MOF Team" placeholder.
    MOFDataset {
        name: "MOF Dataset",
        description: "Metal-organic frameworks joint NER+RE. GPT-3/Llama extraction.",
        url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC10869356/",
        entity_types: ["MOF", "Linker", "Metal", "Property", "Application"],
        language: "en",
        domain: "materials",
        license: "CC-BY-4.0",
        citation: "Dagdelen et al. (2024)",
        paper_url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC10869356/",
        year: 2024,
        format: "JSONL",
        notes: "Metal-organic framework literature; LLM-based extraction pipeline",
        categories: [ner, relation_extraction, arcane_domain],
    },

    // Solid-State Doping: English impurity-doping joint NER + relation extraction entry
    // in JSONL format. `url` and `paper_url` are the same PMC article, which is also the
    // source article for the MOFDataset entry. `citation` names that article's authors
    // instead of the earlier "Doping Team" placeholder.
    SolidStateDoping {
        name: "Solid-State Doping",
        description: "Impurity doping in materials. Joint NER+RE from literature.",
        url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC10869356/",
        entity_types: ["Host", "Dopant", "Property", "Concentration", "Method"],
        language: "en",
        domain: "materials",
        license: "CC-BY-4.0",
        citation: "Dagdelen et al. (2024)",
        paper_url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC10869356/",
        year: 2024,
        format: "JSONL",
        notes: "Semiconductor doping literature; tests materials science terminology",
        categories: [ner, relation_extraction, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Agriculture
    // =========================================================================

    // AgriNER: English agricultural NER + relation extraction entry in JSONL format.
    // The description advertises 36 entity types while `entity_types` lists six labels.
    // NOTE(review): confirm whether the list is meant to be a representative subset.
    AgriNER {
        name: "AgriNER",
        description: "Agricultural knowledge graph construction. 36 entity types, 9 relation types.",
        url: "https://2023.eswc-conferences.org/wp-content/uploads/2023/05/paper_De_2023_AgriNER.pdf",
        entity_types: ["Crop", "Disease", "Soil", "Pathogen", "Pesticide", "Product"],
        language: "en",
        domain: "agriculture",
        license: "Research",
        citation: "De et al. (2023)",
        paper_url: "https://2023.eswc-conferences.org/AgriNER/",
        year: 2023,
        format: "JSONL",
        notes: "Agricultural KG construction; covers crops, diseases, soil, pathogens",
        categories: [ner, relation_extraction, arcane_domain],
    },

    // AGRONER: English agricultural NER entry in BIO format with six entity types.
    // `url` is the publisher abstract page and `paper_url` is a DOI link.
    // NOTE(review): "AGRONER Team" looks like a placeholder credit.
    AGRONER {
        name: "AGRONER",
        description: "Unsupervised agricultural NER. Six major agricultural entity types.",
        url: "https://www.sciencedirect.com/science/article/abs/pii/S0957417423009429",
        entity_types: ["Disease", "Soil", "Pathogen", "Pesticide", "Crop", "Product"],
        language: "en",
        domain: "agriculture",
        license: "Research",
        citation: "AGRONER Team (2023)",
        paper_url: "https://doi.org/10.1016/j.eswa.2023.121001",
        year: 2023,
        format: "BIO",
        notes: "Unsupervised approach; no manual annotation required",
        categories: [ner, arcane_domain],
    },

    // NOTE: AgCNER exists earlier in file (Biomedical/Temporal section)

    // AgMNER: Chinese (`zh`) multimodal (text + speech) agricultural NER entry in JSONL
    // format. `url` and `paper_url` are the same Nature article link.
    // NOTE(review): "AgMNER Team" looks like a placeholder credit.
    AgMNER {
        name: "AgMNER",
        description: "Chinese multimodal agricultural NER. Text and speech combined.",
        url: "https://www.nature.com/articles/s41598-025-88874-9",
        entity_types: ["Crop", "Disease", "Pest", "Method"],
        language: "zh",
        domain: "agriculture",
        license: "CC-BY-4.0",
        citation: "AgMNER Team (2025)",
        paper_url: "https://www.nature.com/articles/s41598-025-88874-9",
        year: 2025,
        format: "JSONL",
        notes: "Multimodal NER; combines text and speech for agricultural domain",
        categories: [ner, multilingual, speech, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Polish Coreference
    // =========================================================================

    // Polish Coreference Corpus: Polish (`pl`) general-domain coref entry; both `format`
    // and `annotation_scheme` are "Custom". No `paper_url` is set.
    // NOTE(review): `url` uses plain http — confirm whether an https endpoint exists.
    PolishCoreferenceCorpus {
        name: "Polish Coreference Corpus",
        description: "Polish coreference resolution corpus. General domain Polish text.",
        url: "http://zil.ipipan.waw.pl/PolishCoreferenceCorpus",
        entity_types: ["PER", "ORG", "LOC"],
        language: "pl",
        domain: "general",
        license: "CC-BY-SA-4.0",
        citation: "Ogrodniczuk et al. (2015)",
        year: 2015,
        format: "Custom",
        annotation_scheme: "Custom",
        notes: "Polish morphological complexity; rich inflection system",
        categories: [coref, multilingual],
    },

    // =========================================================================
    // Niche Domains: Arabic Event Coreference
    // =========================================================================

    // Arabic Event Coreference: Arabic (`ar`) news-domain event coref entry in CoNLL
    // format with the "CoNLLCoref" annotation scheme. `url` and `paper_url` are the same
    // ACM DOI link. NOTE(review): "Arabic Event Coref Team" looks like a placeholder.
    ArabicEventCoref {
        name: "Arabic Event Coreference",
        description: "Arabic event coreference. Underexplored language for event coref.",
        url: "https://dl.acm.org/doi/10.1145/3743047",
        entity_types: ["Event", "Time", "Location", "Participant"],
        language: "ar",
        domain: "news",
        license: "Research",
        citation: "Arabic Event Coref Team (2024)",
        paper_url: "https://dl.acm.org/doi/10.1145/3743047",
        year: 2024,
        format: "CoNLL",
        annotation_scheme: "CoNLLCoref",
        notes: "Arabic event coreference; RTL script; underexplored language",
        categories: [coref, event_coref, multilingual],
    },

    // =========================================================================
    // Niche Domains: Code-Switching & Low-Resource
    // =========================================================================

    // Hindi-English Social Media NER: code-switched social-media NER entry in CoNLL
    // format. `url` is the top level of a dataset-index repository, not a file path, and
    // no `paper_url` is set.
    // NOTE(review): `language` is the compound tag "hi-en", whereas other mixed-language
    // entries in this section use "mul" — confirm consumers accept it.
    // NOTE(review): `citation` is a placeholder without the year other entries carry.
    HindiEnglishSocialMediaNER {
        name: "Hindi-English Social Media NER",
        description: "Code-switched Hindi-English NER from social media.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["PER", "LOC", "ORG"],
        language: "hi-en",
        domain: "social_media",
        license: "Research",
        citation: "Hindi-English NER Team",
        year: 2018,
        format: "CoNLL",
        notes: "Code-switching between Hindi (Devanagari) and English; social media",
        categories: [ner, multilingual, social_media, low_resource],
    },

    // =========================================================================
    // Niche Domains: Astronomy & Space (Extended)
    // =========================================================================

    // astroBERT Corpus: English astronomy entry with a custom format (395,499 papers per
    // `size_hint`). `url` is the arXiv HTML rendering (v2) of the paper in `paper_url`.
    // NOTE(review): the description is about a trained BERT model rather than an
    // annotated dataset — confirm this belongs in a dataset registry as an NER entry.
    AstroBERTCorpus {
        name: "astroBERT Corpus",
        description: "Domain-specific BERT trained on 395k astronomical papers.",
        url: "https://arxiv.org/html/2310.17892v2",
        entity_types: ["CelestialObject", "Mission", "Instrument", "Phenomenon"],
        language: "en",
        domain: "astronomy",
        license: "Research",
        citation: "Grezes et al. (2023)",
        paper_url: "https://arxiv.org/abs/2310.17892",
        year: 2023,
        format: "Custom",
        size_hint: "395,499 astronomical papers",
        notes: "Domain-adapted BERT for astronomical entity extraction",
        categories: [ner, arcane_domain],
    },

    // Astronomical Telegram KEE: English astronomy NER entry in JSONL format built from
    // GCN Circulars. `url` and `paper_url` are the same journal link.
    // NOTE(review): "KEE Team" looks like a placeholder credit.
    AstronomicalTelegramKEE {
        name: "Astronomical Telegram KEE",
        description: "Event IDs, object names, telescope names from GCN Circulars.",
        url: "https://www.raa-journal.org/issues/all/2024/v24n6/202405/",
        entity_types: ["EventID", "ObjectName", "TelescopeName", "Observatory"],
        language: "en",
        domain: "astronomy",
        license: "Research",
        citation: "KEE Team (2024)",
        paper_url: "https://www.raa-journal.org/issues/all/2024/v24n6/202405/",
        year: 2024,
        format: "JSONL",
        notes: "LLM extraction from GCN Circulars; astronomical event reports",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Music & Audio
    // =========================================================================

    // Saraga: multi-language (`mul`) Indian art-music metadata entry in JSONL format.
    // `url` is the PDF and `paper_url` the abstract page of the same arXiv paper.
    // NOTE(review): "Saraga Team" looks like a placeholder credit.
    Saraga {
        name: "Saraga",
        description: "Indian Art Music dataset. Carnatic and Hindustani traditions.",
        url: "https://arxiv.org/pdf/2309.16396.pdf",
        entity_types: ["Raaga", "Taala", "Artist", "Composition", "Instrument"],
        language: "mul",
        domain: "music",
        license: "CC-BY-4.0",
        citation: "Saraga Team (2023)",
        paper_url: "https://arxiv.org/abs/2309.16396",
        year: 2023,
        format: "JSONL",
        notes: "Indian classical music; Carnatic/Hindustani metadata extraction",
        categories: [ner, multilingual, arcane_domain],
    },

    // MusicBrainz RE: English music-metadata relation-extraction entry with a custom
    // format. `url` and `paper_url` are the same PDF of the cited distant-supervision
    // paper, so no separate data download is recorded here.
    MusicBrainzRE {
        name: "MusicBrainz RE",
        description: "Music metadata relations from Freebase/MusicBrainz. 116M instances.",
        url: "https://web.stanford.edu/~jurafsky/mintz.pdf",
        entity_types: ["Artist", "Album", "Track", "Label", "Genre"],
        language: "en",
        domain: "music",
        license: "CC0",
        citation: "Mintz et al. (2009)",
        paper_url: "https://web.stanford.edu/~jurafsky/mintz.pdf",
        year: 2009,
        format: "Custom",
        size_hint: "116 million instances, 7,300 binary relations",
        notes: "Distant supervision from Freebase; music metadata relations",
        categories: [relation_extraction, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Archaeology
    // =========================================================================

    // DINAA: English archaeology entry with a custom format. `url` is an Open Context
    // article about the project; no `paper_url` is set.
    // NOTE(review): `citation` is a placeholder without the year other entries carry.
    DINAA {
        name: "DINAA",
        description: "Digital Index of North American Archaeology. Geospatial heritage data.",
        url: "https://ux.opencontext.org/endangered-data-and-the-digital-index-of-north-american-archaeology-dinaa/",
        entity_types: ["Site", "Artifact", "Culture", "Period", "Location"],
        language: "en",
        domain: "archaeology",
        license: "CC-BY-4.0",
        citation: "DINAA Team",
        year: 2015,
        format: "Custom",
        notes: "North American archaeological sites; geospatial heritage preservation",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Semi-Structured Web RE
    // =========================================================================

    // IMDb Semi-Structured RE: English entertainment-domain relation-extraction entry in
    // JSONL format. `url` and `paper_url` are the same VLDB paper PDF, so no separate
    // data download is recorded here.
    IMDbSemiStructuredRE {
        name: "IMDb Semi-Structured RE",
        description: "Distantly supervised extraction from structured web content.",
        url: "https://www.vldb.org/pvldb/vol11/p1084-lockard.pdf",
        entity_types: ["Movie", "Person", "Role", "Date", "Award"],
        language: "en",
        domain: "entertainment",
        license: "Research",
        citation: "Lockard et al. (2018)",
        paper_url: "https://www.vldb.org/pvldb/vol11/p1084-lockard.pdf",
        year: 2018,
        format: "JSONL",
        notes: "Web table extraction; semi-structured movie database relations",
        categories: [relation_extraction, arcane_domain],
    },

    // NOTE: SciER exists earlier in file (Entity Linking section)

    // =========================================================================
    // Niche Domains: Slot Filling / Intent NER
    // =========================================================================

    // ATIS Flight Booking: English travel-domain slot-filling entry in BIO format,
    // tagged only as `ner`. `url` is a third-party GitHub repository (JointSLU); no
    // `paper_url` is set.
    ATISFlightBooking {
        name: "ATIS Flight Booking",
        description: "Slot-filling NER for flight booking intents. Classic NLU benchmark.",
        url: "https://github.com/yvchen/JointSLU",
        entity_types: ["FromCity", "ToCity", "DepartDate", "ReturnDate", "Airline", "FlightNumber"],
        language: "en",
        domain: "travel",
        license: "Research",
        citation: "Hemphill et al. (1990)",
        year: 1990,
        format: "BIO",
        notes: "Classic slot-filling benchmark; spoken language understanding",
        categories: [ner],
    },

    // =========================================================================
    // Niche Domains: Paleontology
    // =========================================================================

    // Paleontology NER: English paleontology NER entry in CoNLL format. `url` is a PDF
    // of the Findings of EMNLP 2023 paper whose Anthology page is in `paper_url`.
    // NOTE(review): "Paleo NER Team" looks like a placeholder credit.
    PaleontologyNER {
        name: "Paleontology NER",
        description: "Dinosaurs, mammals, and river ecosystems entity retrieval.",
        url: "https://aclanthology.org/anthology-files/anthology-files/pdf/findings/2023.findings-emnlp.218v1.pdf",
        entity_types: ["Taxon", "Location", "TimePeriod", "Formation", "Specimen"],
        language: "en",
        domain: "paleontology",
        license: "Research",
        citation: "Paleo NER Team (2023)",
        paper_url: "https://aclanthology.org/2023.findings-emnlp.218/",
        year: 2023,
        format: "CoNLL",
        notes: "Paleontological literature; fossil taxa and geological formations",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Water Resources
    // =========================================================================

    // Water Resource NER: English environment-domain NER entry in BIO format.
    // `url` is the PDF and `paper_url` the landing page of the same Frontiers article.
    // NOTE(review): "Water NER Team" looks like a placeholder credit.
    WaterResourceNER {
        name: "Water Resource NER",
        description: "Domain-adaptive NER for AI-driven water resource management.",
        url: "https://www.frontiersin.org/journals/environmental-science/articles/10.3389/fenvs.2025.1558317/pdf",
        entity_types: ["WaterBody", "Infrastructure", "Pollutant", "Measurement", "Policy"],
        language: "en",
        domain: "environment",
        license: "CC-BY-4.0",
        citation: "Water NER Team (2025)",
        paper_url: "https://www.frontiersin.org/articles/10.3389/fenvs.2025.1558317/",
        year: 2025,
        format: "BIO",
        notes: "Water management domain; infrastructure and policy entities",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Cybersecurity
    // =========================================================================

    // MalwareTextDB: annotated malware reports for cybersecurity NER
    // (English, BRAT standoff format).
    // `citation` / `paper_url` reference the ACL 2017 paper that introduced the
    // corpus, replacing the earlier placeholder "MalwareTextDB Team".
    // NOTE(review): `url` is the root of a general dataset-index repository, not a
    // dataset-specific download, and `entity_types` should be checked against the
    // labels actually used in the released annotations.
    MalwareTextDB {
        name: "MalwareTextDB",
        description: "Annotated malware articles for cybersecurity NER.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Malware", "Vulnerability", "Tool", "ThreatActor", "IOC"],
        language: "en",
        domain: "cybersecurity",
        license: "Research",
        citation: "Lim et al. (2017)",
        paper_url: "https://aclanthology.org/P17-1143/",
        year: 2017,
        format: "BRAT",
        notes: "Security bulletin extraction; malware family identification",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Finance
    // =========================================================================

    // SEC-filings: finance-domain NER over SEC filing documents (English, CoNLL).
    // `url` fetches a single test file (FIN3.txt) from the dataset's CoNLL-format
    // directory, so only that file is referenced by this entry.
    // `citation`, `paper_url` and `year` reference the ALTA 2015 paper that
    // released the corpus, replacing the earlier placeholder "SEC-filings Team" / 2018.
    // NOTE(review): `entity_types` should be checked against the tags that
    // actually occur in FIN3.txt before relying on them for label mapping.
    SECFilingsNER {
        name: "SEC-filings",
        description: "Finance domain NER from SEC filing documents.",
        url: "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/test/FIN3.txt",
        entity_types: ["Company", "Person", "Money", "Date", "Percentage"],
        language: "en",
        domain: "finance",
        license: "CC-BY-3.0",
        citation: "Salinas Alvarado et al. (2015)",
        paper_url: "https://aclanthology.org/U15-1010/",
        year: 2015,
        format: "CoNLL",
        notes: "Financial documents; SEC 10-K and 10-Q filings",
        categories: [ner],
    },

    // =========================================================================
    // Niche Domains: Anatomical/Biomedical
    // =========================================================================

    // AnEM: anatomical entity mention corpus over biomedical text
    // (English, standoff annotation format).
    // `url` is a project page rather than a direct file link; no `paper_url`
    // is recorded for the Ohta et al. (2012) citation.
    AnEM {
        name: "AnEM",
        description: "Anatomical entity mentions corpus. Anatomy terms in biomedical text.",
        url: "http://www.nactem.ac.uk/anatomy/",
        entity_types: ["AnatomicalStructure", "Organ", "Tissue", "Cell", "OrganismSubdivision"],
        language: "en",
        domain: "biomedical",
        license: "CC-BY-SA-3.0",
        citation: "Ohta et al. (2012)",
        year: 2012,
        format: "Standoff",
        notes: "Anatomical entity corpus; fine-grained anatomy typing",
        categories: [ner, biomedical],
    },

    // =========================================================================
    // Niche Domains: Recipe/Food (Extended)
    // =========================================================================

    // RecipeDB Annotated: ingredient-phrase tagging corpus (English, JSONL).
    // `url` and `paper_url` are the same ACL Anthology page, so this entry has
    // no separate data location.
    // NOTE(review): "RecipeDB Team (2024)" reads like a placeholder citation —
    // confirm the author list from the LREC paper.
    RecipeDBAnnotated {
        name: "RecipeDB Annotated",
        description: "88k ingredient phrases via clustering-based sampling with Stanford NER.",
        url: "https://aclanthology.org/2024.lrec-main.406/",
        entity_types: ["Ingredient", "Quantity", "Unit", "Preparation"],
        language: "en",
        domain: "food",
        license: "CC-BY-4.0",
        citation: "RecipeDB Team (2024)",
        paper_url: "https://aclanthology.org/2024.lrec-main.406/",
        year: 2024,
        format: "JSONL",
        size_hint: "88,526 ingredient phrases",
        notes: "Clustering-based annotation; Stanford NER pipeline",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Social Media Twitter
    // =========================================================================

    // Ritter et al. (2011) Twitter NER corpus (English, CoNLL format).
    // `url` is the root of the authors' tool repository rather than a data file.
    // NOTE(review): `notes` says "10 entity types" but `entity_types` lists only
    // seven — reconcile the list with the dataset's actual tag set.
    RitterTwitterNER {
        name: "Ritter Twitter NER",
        description: "Twitter NER dataset with diverse entity types from tweets.",
        url: "https://github.com/aritter/twitter_nlp",
        entity_types: ["PER", "LOC", "ORG", "PRODUCT", "FACILITY", "BAND", "SPORTSTEAM"],
        language: "en",
        domain: "social_media",
        license: "Research",
        citation: "Ritter et al. (2011)",
        paper_url: "https://aclanthology.org/D11-1141/",
        year: 2011,
        format: "CoNLL",
        notes: "Early Twitter NER; 10 entity types including bands and sports teams",
        categories: [ner, social_media],
    },

    // =========================================================================
    // Niche Domains: Music Domain
    // =========================================================================

    // Music-domain NER entry (English, CoNLL format).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Music-NER Team" reads like a placeholder
    // citation — confirm the dataset's real source and authors.
    MusicNER {
        name: "Music-NER",
        description: "Music domain entities. Artists, albums, songs, genres.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Artist", "Album", "Song", "Genre", "Instrument", "Label"],
        language: "en",
        domain: "music",
        license: "MIT",
        citation: "Music-NER Team",
        year: 2020,
        format: "CoNLL",
        notes: "Music domain NER; includes record labels and instrument types",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Tutoring/Education (Extended)
    // =========================================================================

    // Tutoring-dialogue corpus entry (English, custom format, 500 sessions).
    // `url` is the paper PDF and `paper_url` the ACL Anthology page for the same ID.
    // NOTE(review): the description frames the task as tutoring *mode
    // identification*; confirm that span-level labels matching `entity_types`
    // exist before treating this as an NER dataset.
    // NOTE(review): verify the author attribution against the C16-1188 paper.
    TutoringSessionsAlgebra {
        name: "500 Tutoring Sessions",
        description: "32k utterances from elementary algebra/physics tutoring. Mode identification.",
        url: "https://aclanthology.org/C16-1188.pdf",
        entity_types: ["Student", "Tutor", "Concept", "Problem"],
        language: "en",
        domain: "education",
        license: "Research",
        citation: "Boyer et al. (2016)",
        paper_url: "https://aclanthology.org/C16-1188/",
        year: 2016,
        format: "Custom",
        size_hint: "500 sessions, 32,368 utterances",
        notes: "Tutoring mode identification; algebra and physics domains",
        categories: [ner, dialogue, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Geology (Extended)
    // =========================================================================

    // GNER: Chinese (`zh`) geological NER over geoscience survey reports, BIO format.
    // `url` is the publisher's abstract page and `paper_url` is the DOI link for
    // the same article (10.1029/2019EA000610); neither is a data download.
    // NOTE(review): "GNER Team (2019)" reads like a placeholder citation —
    // confirm the author list from the article.
    GNERGeoscience {
        name: "GNER",
        description: "Chinese geological entities from geoscience survey reports.",
        url: "https://agupubs.onlinelibrary.wiley.com/doi/abs/10.1029/2019EA000610",
        entity_types: ["Rock", "Mineral", "Stratum", "Age", "Location"],
        language: "zh",
        domain: "geology",
        license: "Research",
        citation: "GNER Team (2019)",
        paper_url: "https://doi.org/10.1029/2019EA000610",
        year: 2019,
        format: "BIO",
        notes: "Chinese geoscience reports; geological survey terminology",
        categories: [ner, multilingual, arcane_domain],
    },

    // Four Regions Geology NER: Chinese (`zh`) regional geological survey corpus,
    // BIO format. The six `entity_types` match the "6 typical geological
    // categories" mentioned in the description.
    // NOTE(review): "Four Regions Team" reads like a placeholder citation and no
    // `paper_url` is recorded — confirm provenance from the download page.
    FourRegionsGeologyNER {
        name: "Four Regions Geology NER",
        description: "Regional geological surveys with 6 typical geological categories.",
        url: "https://www.geodoi.ac.cn/WebEn/down.aspx?ID=1873",
        entity_types: ["Rock", "Mineral", "Stratum", "Structure", "Age", "Location"],
        language: "zh",
        domain: "geology",
        license: "Research",
        citation: "Four Regions Team",
        year: 2020,
        format: "BIO",
        notes: "Regional Chinese geological surveys; multiple survey regions",
        categories: [ner, multilingual, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Podcasts & Speech (Extended)
    // =========================================================================

    // MSP-Podcast: English podcast speech corpus entry (custom format).
    // `url` is the lab's project page; no `paper_url` is recorded.
    // NOTE(review): the listed types (Speaker, Topic, Emotion, Sentiment) look
    // like utterance-level annotation dimensions rather than text-span labels —
    // confirm this corpus belongs under the `ner` category.
    // NOTE(review): verify the "100k+ episodes" figure against the project page.
    MSPPodcast {
        name: "MSP-Podcast",
        description: "100k+ English podcast episodes with multimodal annotations.",
        url: "https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html",
        entity_types: ["Speaker", "Topic", "Emotion", "Sentiment"],
        language: "en",
        domain: "speech",
        license: "Research",
        citation: "Lotfian & Busso (2019)",
        year: 2019,
        format: "Custom",
        size_hint: "100,000+ podcast episodes",
        notes: "Multimodal podcast annotations; emotion and sentiment",
        categories: [ner, speech, arcane_domain],
    },

    // Spotify Podcasts Dataset entry: transcribed podcast episodes (English, JSONL).
    // `url` is an Interspeech 2023 paper PDF and `paper_url` is the archive page
    // for the same paper; neither is a data download.
    // NOTE(review): confirm that the linked Interspeech paper is the dataset's
    // defining publication and that "Spotify Research (2023)" is the right credit.
    SpotifyPodcastsDataset {
        name: "Spotify Podcasts Dataset",
        description: "Professional and amateur podcast episodes with transcriptions.",
        url: "https://www.isca-archive.org/interspeech_2023/kotey23_interspeech.pdf",
        entity_types: ["Host", "Guest", "Topic", "Advertisement"],
        language: "en",
        domain: "speech",
        license: "Research",
        citation: "Spotify Research (2023)",
        paper_url: "https://www.isca-archive.org/interspeech_2023/kotey23_interspeech.html",
        year: 2023,
        format: "JSONL",
        notes: "Professional and amateur podcasts; varied audio quality",
        categories: [ner, speech, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Sports (Extended)
    // =========================================================================

    // General sports-text NER entry (English, CoNLL format).
    // `url` is the HTML rendering of arXiv 2406.12252 and `paper_url` its abstract
    // page; neither is a data download.
    // NOTE(review): "Sports NER Team (2024)" reads like a placeholder citation —
    // confirm the authors and that the paper actually releases this dataset.
    SportsNERGeneral {
        name: "Sports NER",
        description: "Player names, team names, event specifics from sports texts.",
        url: "https://arxiv.org/html/2406.12252v1",
        entity_types: ["Player", "Team", "Event", "Venue", "Score", "Date"],
        language: "en",
        domain: "sports",
        license: "Research",
        citation: "Sports NER Team (2024)",
        paper_url: "https://arxiv.org/abs/2406.12252",
        year: 2024,
        format: "CoNLL",
        notes: "General sports domain; player and team tracking",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: E-sports & Gaming Stats
    // =========================================================================

    // Esports NER entry: competitive-gaming entities (English, CoNLL format).
    // NOTE(review): `url` is byte-identical to the one used by the
    // `SportsNERGeneral` entry (arXiv 2406.12252 HTML) — confirm whether this is
    // really a distinct dataset with its own source.
    // NOTE(review): "Esports NER Team (2024)" reads like a placeholder citation and
    // no `paper_url` is recorded.
    EsportsNER {
        name: "Esports NER",
        description: "Esports entity recognition. Pro players, teams, tournaments, games.",
        url: "https://arxiv.org/html/2406.12252v1",
        entity_types: ["Player", "Team", "Tournament", "Game", "Champion", "Map"],
        language: "en",
        domain: "gaming",
        license: "Research",
        citation: "Esports NER Team (2024)",
        year: 2024,
        format: "CoNLL",
        notes: "Competitive gaming; League of Legends, CS:GO, Dota 2 terminology",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Fashion (Extended)
    // =========================================================================

    // DeepFashion2 entry (fashion domain, JSONL annotations).
    // NOTE(review): the description and `size_hint` count *images* and clothing
    // items, and `notes` mentions landmarks and pose — this looks like a vision
    // dataset; confirm it has text annotations that justify the `ner` category
    // and the `language: "en"` field.
    DeepFashion2 {
        name: "DeepFashion2",
        description: "Comprehensive fashion dataset. 491k images, 801k clothing items.",
        url: "https://github.com/switchablenorms/DeepFashion2",
        entity_types: ["Category", "Style", "Color", "Pattern", "Landmark"],
        language: "en",
        domain: "fashion",
        license: "Research",
        citation: "Ge et al. (2019)",
        paper_url: "https://arxiv.org/abs/1901.07973",
        year: 2019,
        format: "JSONL",
        size_hint: "491k images, 801k clothing items, 13 categories",
        notes: "Dense landmarks; cross-domain pose variation",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Construction & Engineering
    // =========================================================================

    // Construction-industry NER entry (English, BIO format).
    // `url` is a ScienceDirect article page rather than a data download, and no
    // separate `paper_url` is recorded.
    // NOTE(review): "Construction NER Team (2021)" reads like a placeholder
    // citation — confirm authors and data availability from the article.
    ConstructionNER {
        name: "Construction NER",
        description: "Construction industry entities. Materials, equipment, processes.",
        url: "https://www.sciencedirect.com/science/article/pii/S0926580520309481",
        entity_types: ["Material", "Equipment", "Process", "Measurement", "Location"],
        language: "en",
        domain: "construction",
        license: "Research",
        citation: "Construction NER Team (2021)",
        year: 2021,
        format: "BIO",
        notes: "Construction domain; building materials and heavy equipment",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Pharmaceutical
    // =========================================================================

    // Pharmaceutical NER entry: drug names, dosages and routes (English, BIO).
    // Categorised as both `biomedical` and `clinical`.
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "PharmaNER Team" reads like a placeholder
    // citation — confirm the dataset's real source and authors.
    PharmaNER {
        name: "PharmaNER",
        description: "Pharmaceutical named entity recognition. Drug names, dosages, routes.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Drug", "Dosage", "Route", "Frequency", "Indication"],
        language: "en",
        domain: "biomedical",
        license: "Research",
        citation: "PharmaNER Team",
        year: 2019,
        format: "BIO",
        notes: "Pharmaceutical domain; prescription and OTC drug extraction",
        categories: [ner, biomedical, clinical],
    },

    // =========================================================================
    // Niche Domains: E-commerce (Extended)
    // =========================================================================

    // Product-review aspect/sentiment entry credited to SemEval 2014 (English, XML).
    // `url` and `paper_url` carry the same paper ID (S14-2004): `url` uses the
    // `www.aclweb.org/anthology` host and `paper_url` the `aclanthology.org` host.
    // Neither is a data download.
    ProductReviewNER {
        name: "Product Review NER",
        description: "E-commerce product reviews with aspect and sentiment entities.",
        url: "https://www.aclweb.org/anthology/S14-2004/",
        entity_types: ["Aspect", "Opinion", "Product", "Feature", "Sentiment"],
        language: "en",
        domain: "ecommerce",
        license: "CC-BY-4.0",
        citation: "SemEval 2014",
        paper_url: "https://aclanthology.org/S14-2004/",
        year: 2014,
        format: "XML",
        notes: "Aspect-based sentiment; product feature extraction",
        categories: [ner],
    },

    // =========================================================================
    // Niche Domains: Real Estate
    // =========================================================================

    // Real-estate listing NER entry (English, CoNLL format).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Real Estate NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    RealEstateNER {
        name: "Real Estate NER",
        description: "Property listings entity extraction. Addresses, prices, features.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Address", "Price", "Size", "Rooms", "Amenity", "PropertyType"],
        language: "en",
        domain: "real_estate",
        license: "Research",
        citation: "Real Estate NER Team",
        year: 2020,
        format: "CoNLL",
        notes: "Property listing domain; residential and commercial",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Automotive
    // =========================================================================

    // Automotive NER entry: vehicle makes, models, parts and specs (English, CoNLL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Automotive NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    AutomotiveNER {
        name: "Automotive NER",
        description: "Vehicle and automotive entities. Makes, models, parts, specs.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Make", "Model", "Part", "Specification", "Year", "Price"],
        language: "en",
        domain: "automotive",
        license: "Research",
        citation: "Automotive NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Automotive domain; vehicle specifications and parts",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Tourism & Travel
    // =========================================================================

    // Tourism/travel NER entry: attractions, hotels, restaurants (English, CoNLL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Tourism NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    TourismNER {
        name: "Tourism NER",
        description: "Tourism and travel entities. Attractions, hotels, restaurants.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Attraction", "Hotel", "Restaurant", "City", "Activity", "Price"],
        language: "en",
        domain: "tourism",
        license: "CC-BY-4.0",
        citation: "Tourism NER Team",
        year: 2019,
        format: "CoNLL",
        notes: "Travel domain; tourist attractions and accommodations",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Energy & Utilities
    // =========================================================================

    // Energy-sector NER entry: power plants, fuels, grid infrastructure (English, BIO).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Energy NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    EnergyNER {
        name: "Energy NER",
        description: "Energy sector entities. Power plants, fuels, grid infrastructure.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["PowerPlant", "Fuel", "Grid", "Capacity", "Company", "Location"],
        language: "en",
        domain: "energy",
        license: "Research",
        citation: "Energy NER Team",
        year: 2020,
        format: "BIO",
        notes: "Energy sector; renewable and fossil fuel infrastructure",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Insurance
    // =========================================================================

    // Insurance-domain NER entry: policies, claims, coverages (English, JSONL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Insurance NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    InsuranceNER {
        name: "Insurance NER",
        description: "Insurance domain entities. Policies, claims, coverages.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Policy", "Claim", "Coverage", "Premium", "Deductible", "Beneficiary"],
        language: "en",
        domain: "insurance",
        license: "Research",
        citation: "Insurance NER Team",
        year: 2021,
        format: "JSONL",
        notes: "Insurance domain; policy and claims extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Logistics & Supply Chain
    // =========================================================================

    // Logistics/supply-chain NER entry: shipments, warehouses, routes (English, CoNLL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Logistics NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    LogisticsNER {
        name: "Logistics NER",
        description: "Supply chain and logistics entities. Shipments, warehouses, routes.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Shipment", "Warehouse", "Route", "Carrier", "TrackingNumber", "Date"],
        language: "en",
        domain: "logistics",
        license: "Research",
        citation: "Logistics NER Team",
        year: 2020,
        format: "CoNLL",
        notes: "Supply chain domain; shipping and warehousing",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: HR & Recruitment
    // =========================================================================

    // Resume/CV entity-extraction entry hosted on Kaggle (English, JSONL, CC0).
    // The citation names the publisher ("DataTurks") without a year, and no
    // `paper_url` is recorded.
    // Unlike most entries in this niche-domain group, it is filed under `ner`
    // only, without `arcane_domain`.
    ResumeNER {
        name: "Resume NER",
        description: "Resume/CV entity extraction. Skills, experience, education.",
        url: "https://www.kaggle.com/datasets/dataturks/resume-entities-for-ner",
        entity_types: ["Skill", "Company", "Degree", "University", "Date", "Location"],
        language: "en",
        domain: "hr",
        license: "CC0",
        citation: "DataTurks",
        year: 2018,
        format: "JSONL",
        notes: "Resume parsing; skill and experience extraction",
        categories: [ner],
    },

    // Job-posting NER entry: titles, skills, salaries, benefits (English, CoNLL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Job Posting NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    JobPostingNER {
        name: "Job Posting NER",
        description: "Job posting entity extraction. Requirements, benefits, qualifications.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["JobTitle", "Skill", "Salary", "Location", "Company", "Benefit"],
        language: "en",
        domain: "hr",
        license: "Research",
        citation: "Job Posting NER Team",
        year: 2020,
        format: "CoNLL",
        notes: "Job listing domain; requirement and qualification extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Healthcare Administration
    // =========================================================================

    // Healthcare-administration NER entry: procedures, billing codes, facilities
    // (English, BIO). Categorised as `clinical` in addition to `ner`.
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Healthcare Admin Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    HealthcareAdminNER {
        name: "Healthcare Admin NER",
        description: "Healthcare administration entities. Procedures, billing codes, facilities.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Procedure", "BillingCode", "Facility", "Provider", "Insurance"],
        language: "en",
        domain: "healthcare",
        license: "Research",
        citation: "Healthcare Admin Team",
        year: 2021,
        format: "BIO",
        notes: "Healthcare administration; billing and coding",
        categories: [ner, clinical, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Telecommunications
    // =========================================================================

    // Telecommunications NER entry: networks, devices, protocols (English, CoNLL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Telecom NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    TelecomNER {
        name: "Telecom NER",
        description: "Telecommunications entities. Networks, devices, protocols.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Network", "Device", "Protocol", "Carrier", "Plan", "Speed"],
        language: "en",
        domain: "telecom",
        license: "Research",
        citation: "Telecom NER Team",
        year: 2020,
        format: "CoNLL",
        notes: "Telecommunications domain; network and service extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Weather & Climate
    // =========================================================================

    // Weather/climate NER entry: events, measurements, locations (English, BIO).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Weather NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    WeatherNER {
        name: "Weather NER",
        description: "Weather and climate entities. Events, measurements, locations.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["WeatherEvent", "Temperature", "Precipitation", "Location", "Date", "Wind"],
        language: "en",
        domain: "weather",
        license: "CC-BY-4.0",
        citation: "Weather NER Team",
        year: 2021,
        format: "BIO",
        notes: "Meteorological domain; weather event extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Manufacturing
    // =========================================================================

    // Manufacturing NER entry: parts, processes, machines, defects (English, BIO).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Manufacturing NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    ManufacturingNER {
        name: "Manufacturing NER",
        description: "Manufacturing entities. Parts, processes, machines, defects.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Part", "Process", "Machine", "Defect", "Material", "Measurement"],
        language: "en",
        domain: "manufacturing",
        license: "Research",
        citation: "Manufacturing NER Team",
        year: 2021,
        format: "BIO",
        notes: "Industrial manufacturing; quality control and process",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Retail & Inventory
    // =========================================================================

    // Retail-inventory NER entry: SKUs, quantities, locations, prices (English, JSONL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Retail NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    RetailInventoryNER {
        name: "Retail Inventory NER",
        description: "Retail inventory entities. SKUs, quantities, locations, prices.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["SKU", "Quantity", "Location", "Price", "Category", "Supplier"],
        language: "en",
        domain: "retail",
        license: "Research",
        citation: "Retail NER Team",
        year: 2020,
        format: "JSONL",
        notes: "Inventory management; stock and supplier tracking",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Agriculture (Extended)
    // =========================================================================

    // Crop-disease NER entry: symptoms, pathogens, treatments (English, BIO).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Crop Disease Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    CropDiseaseNER {
        name: "Crop Disease NER",
        description: "Crop disease identification. Symptoms, pathogens, treatments.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Disease", "Symptom", "Pathogen", "Treatment", "Crop", "Stage"],
        language: "en",
        domain: "agriculture",
        license: "CC-BY-4.0",
        citation: "Crop Disease Team",
        year: 2022,
        format: "BIO",
        notes: "Plant pathology; disease symptom and treatment extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Wine & Beverages
    // =========================================================================

    // Wine NER entry: varietals, regions, vintages, tasting notes (English, CoNLL).
    // Filed under the shared `food` domain rather than a wine-specific one.
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Wine NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    WineNER {
        name: "Wine NER",
        description: "Wine domain entities. Varietals, regions, vintages, tasting notes.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Varietal", "Region", "Vintage", "Producer", "TastingNote", "Price"],
        language: "en",
        domain: "food",
        license: "CC-BY-4.0",
        citation: "Wine NER Team",
        year: 2019,
        format: "CoNLL",
        notes: "Wine domain; sommelier terminology and tasting vocabulary",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Pet & Veterinary
    // =========================================================================

    // Veterinary-medicine NER entry: animals, conditions, treatments (English, BIO).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Veterinary NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    VeterinaryNER {
        name: "Veterinary NER",
        description: "Veterinary medicine entities. Animals, conditions, treatments.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Animal", "Breed", "Condition", "Treatment", "Medication", "Symptom"],
        language: "en",
        domain: "veterinary",
        license: "Research",
        citation: "Veterinary NER Team",
        year: 2021,
        format: "BIO",
        notes: "Veterinary medicine; pet health and treatment",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Photography
    // =========================================================================

    // Photography NER entry: cameras, lenses, exposure settings (English, CoNLL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Photography NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    PhotographyNER {
        name: "Photography NER",
        description: "Photography entities. Cameras, lenses, settings, techniques.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Camera", "Lens", "Aperture", "ShutterSpeed", "ISO", "Technique"],
        language: "en",
        domain: "photography",
        license: "CC-BY-4.0",
        citation: "Photography NER Team",
        year: 2020,
        format: "CoNLL",
        notes: "Photography domain; camera gear and technique extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Genealogy
    // =========================================================================

    // Genealogy NER entry: people, relationships, vital dates (English, custom
    // format). Categorised as `historical` in addition to `ner`.
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Genealogy NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    GenealogyNER {
        name: "Genealogy NER",
        description: "Genealogical records entities. Names, relationships, dates, locations.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Person", "Relationship", "BirthDate", "DeathDate", "Location", "Occupation"],
        language: "en",
        domain: "genealogy",
        license: "CC-BY-4.0",
        citation: "Genealogy NER Team",
        year: 2021,
        format: "Custom",
        notes: "Historical records; family history extraction",
        categories: [ner, historical, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Board Games
    // =========================================================================

    // Board-game NER entry: games, mechanics, components, designers (English, JSONL).
    // Shares the `gaming` domain with the esports entry.
    // NOTE(review): the citation names a website ("BoardGameGeek") while `url` is
    // the root of a general dataset-index repository — confirm the real data
    // source and that the CC-BY-4.0 license applies to it.
    BoardGameNER {
        name: "Board Game NER",
        description: "Board game entities. Games, mechanics, components, designers.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Game", "Mechanic", "Component", "Designer", "Publisher", "PlayerCount"],
        language: "en",
        domain: "gaming",
        license: "CC-BY-4.0",
        citation: "BoardGameGeek",
        year: 2022,
        format: "JSONL",
        notes: "Board game domain; BGG taxonomy and mechanics",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Gardening
    // =========================================================================

    // Gardening NER entry: plants, soil, seasons, techniques (English, CoNLL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Gardening NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    GardeningNER {
        name: "Gardening NER",
        description: "Gardening entities. Plants, soil, seasons, techniques.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Plant", "Soil", "Season", "Technique", "Tool", "Pest"],
        language: "en",
        domain: "gardening",
        license: "CC-BY-4.0",
        citation: "Gardening NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Horticulture domain; plant care and cultivation",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Brewing & Distilling
    // =========================================================================

    // Craft-brewing NER entry: ingredients, processes, styles (English, CoNLL).
    // Filed under the shared `food` domain; ABV and IBU appear as entity types.
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Brewing NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    BrewingNER {
        name: "Brewing NER",
        description: "Craft brewing entities. Ingredients, processes, styles, equipment.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Ingredient", "Process", "Style", "Equipment", "ABV", "IBU"],
        language: "en",
        domain: "food",
        license: "CC-BY-4.0",
        citation: "Brewing NER Team",
        year: 2020,
        format: "CoNLL",
        notes: "Craft beer domain; brewing process and style vocabulary",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Knitting & Crafts
    // =========================================================================

    // Knitting/crafts NER entry: patterns, yarns, stitches, tools (English, JSONL).
    // NOTE(review): the citation names a website ("Ravelry") while `url` is the
    // root of a general dataset-index repository — confirm the real data source
    // and that the CC-BY-4.0 license applies to it.
    KnittingNER {
        name: "Knitting NER",
        description: "Knitting and crafts entities. Patterns, yarns, stitches, tools.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Pattern", "Yarn", "Stitch", "Tool", "Size", "Technique"],
        language: "en",
        domain: "crafts",
        license: "CC-BY-4.0",
        citation: "Ravelry",
        year: 2021,
        format: "JSONL",
        notes: "Fiber arts domain; knitting pattern terminology",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Fitness & Exercise
    // =========================================================================

    // Fitness NER entry: exercises, muscles, equipment, set/rep counts (English, CoNLL).
    // NOTE(review): `url` is the root of a general dataset-index repository rather
    // than a dataset-specific file, and "Fitness NER Team" reads like a
    // placeholder citation — confirm the dataset's real source and authors.
    FitnessNER {
        name: "Fitness NER",
        description: "Fitness entities. Exercises, muscles, equipment, routines.",
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Exercise", "Muscle", "Equipment", "Sets", "Reps", "Duration"],
        language: "en",
        domain: "fitness",
        license: "CC-BY-4.0",
        citation: "Fitness NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Exercise domain; workout routine extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Astrology
    // =========================================================================

    // Astrology NER: catalog entry for an astrological-terminology NER dataset
    // (English, CoNLL). paper_url, size_hint, splits, tasks and access_status are
    // not set for this entry.
    AstrologyNER {
        name: "Astrology NER",
        description: "Astrological entities. Signs, planets, houses, aspects.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // "House" is also a label on FragranceNER; the string is the same, the
        // meaning per dataset is not established here.
        entity_types: ["Sign", "Planet", "House", "Aspect", "Transit", "Date"],
        language: "en",
        domain: "astrology",
        license: "CC-BY-4.0",
        citation: "Astrology NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Astrological terminology; horoscope interpretation",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Tattoo & Body Art
    // =========================================================================

    // Tattoo NER: catalog entry for a tattoo/body-art NER dataset (English, JSONL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    TattooNER {
        name: "Tattoo NER",
        description: "Tattoo entities. Styles, placements, artists, designs.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Style", "Placement", "Artist", "Design", "Color", "Size"],
        language: "en",
        // Filed under the broad "art" domain rather than a tattoo-specific value.
        domain: "art",
        license: "CC-BY-4.0",
        citation: "Tattoo NER Team",
        year: 2022,
        format: "JSONL",
        notes: "Body art domain; tattoo style and placement vocabulary",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Perfume & Fragrance
    // =========================================================================

    // Fragrance NER: catalog entry for a perfumery NER dataset (English, JSONL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    FragranceNER {
        name: "Fragrance NER",
        description: "Perfume entities. Notes, accords, houses, concentrations.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Note", "Accord", "House", "Concentration", "Season", "Longevity"],
        language: "en",
        domain: "fragrance",
        license: "CC-BY-4.0",
        // NOTE(review): looks like a data-source name rather than an author/year
        // citation -- confirm.
        citation: "Fragrantica",
        year: 2021,
        format: "JSONL",
        notes: "Perfumery domain; scent pyramid and accord terminology",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Chess
    // =========================================================================

    // Chess NER: catalog entry for a chess-domain NER dataset (English, JSONL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    ChessNER {
        name: "Chess NER",
        description: "Chess entities. Openings, players, tournaments, moves.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // ELO / TimeControl are presumably rating values and game time formats -- confirm.
        entity_types: ["Opening", "Player", "Tournament", "Move", "ELO", "TimeControl"],
        language: "en",
        // Filed under the broad "gaming" domain rather than a chess-specific value.
        domain: "gaming",
        license: "CC-BY-4.0",
        // NOTE(review): names two platforms rather than an author/year citation -- confirm.
        citation: "Lichess/Chess.com",
        year: 2022,
        format: "JSONL",
        notes: "Chess domain; opening theory and tournament extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Cocktails & Mixology
    // =========================================================================

    // Cocktail NER: catalog entry for a mixology NER dataset (English, CoNLL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    CocktailNER {
        name: "Cocktail NER",
        description: "Cocktail entities. Ingredients, techniques, glassware, garnishes.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // The description says "Ingredients"; the label set splits that into
        // "Spirit" and "Mixer".
        entity_types: ["Spirit", "Mixer", "Technique", "Glassware", "Garnish", "Measurement"],
        language: "en",
        // Same "food" domain value as BrewingNER.
        domain: "food",
        license: "CC-BY-4.0",
        citation: "Cocktail NER Team",
        year: 2020,
        format: "CoNLL",
        notes: "Mixology domain; bartending vocabulary and techniques",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Antiques & Collectibles
    // =========================================================================

    // Antiques NER: catalog entry for an antiques/collectibles NER dataset
    // (English, JSONL). paper_url, size_hint, splits, tasks and access_status are
    // not set for this entry.
    AntiquesNER {
        name: "Antiques NER",
        description: "Antiques entities. Periods, styles, materials, makers.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Period", "Style", "Material", "Maker", "Provenance", "Condition"],
        language: "en",
        domain: "antiques",
        license: "CC-BY-4.0",
        citation: "Antiques NER Team",
        year: 2021,
        format: "JSONL",
        notes: "Antiques domain; period furniture and collectibles",
        // Carries an extra `historical` category on top of the usual
        // [ner, arcane_domain] pair used by neighbouring entries.
        categories: [ner, historical, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Sailing & Maritime
    // =========================================================================

    // Maritime NER: catalog entry for a shipping/maritime-logistics NER dataset
    // (English, CoNLL). paper_url, size_hint, splits, tasks and access_status are
    // not set for this entry.
    MaritimeNER {
        name: "Maritime NER",
        description: "Maritime entities. Vessels, ports, routes, cargo.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // IMONumber is presumably the IMO ship identification number -- confirm.
        entity_types: ["Vessel", "Port", "Route", "Cargo", "Flag", "IMONumber"],
        language: "en",
        domain: "maritime",
        // Free-form "Research" rather than a licence identifier such as the
        // "CC-BY-4.0" used by most neighbouring entries.
        // NOTE(review): no access_status accompanies it -- confirm whether this
        // dataset is actually openly downloadable.
        license: "Research",
        citation: "Maritime NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Shipping domain; vessel tracking and maritime logistics",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Equestrian
    // =========================================================================

    // Equestrian NER: catalog entry for a horse-sports NER dataset (English, CoNLL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    EquestrianNER {
        name: "Equestrian NER",
        description: "Equestrian entities. Horses, breeds, disciplines, tack.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // Adds "Rider" and "Competition" beyond the four kinds named in the description.
        entity_types: ["Horse", "Breed", "Discipline", "Tack", "Rider", "Competition"],
        language: "en",
        domain: "equestrian",
        license: "CC-BY-4.0",
        citation: "Equestrian NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Horse sports domain; dressage and jumping terminology",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Woodworking
    // =========================================================================

    // Woodworking NER: catalog entry for a carpentry/joinery NER dataset
    // (English, CoNLL). paper_url, size_hint, splits, tasks and access_status are
    // not set for this entry.
    WoodworkingNER {
        name: "Woodworking NER",
        description: "Woodworking entities. Tools, joints, wood types, finishes.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Tool", "Joint", "WoodType", "Finish", "Technique", "Measurement"],
        language: "en",
        // Same "crafts" domain value as KnittingNER and OrigamiNER.
        domain: "crafts",
        license: "CC-BY-4.0",
        citation: "Woodworking NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Carpentry domain; joinery and finishing vocabulary",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Birdwatching
    // =========================================================================

    // Birdwatching NER: catalog entry for an ornithology NER dataset (English, JSONL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    BirdwatchingNER {
        name: "Birdwatching NER",
        description: "Birdwatching entities. Species, habitats, behaviors, locations.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Species", "Family", "Habitat", "Behavior", "Location", "Season"],
        language: "en",
        domain: "wildlife",
        license: "CC-BY-4.0",
        // NOTE(review): names organisations rather than an author/year citation -- confirm.
        citation: "eBird/Cornell Lab",
        year: 2022,
        format: "JSONL",
        notes: "Ornithology domain; bird identification and behavior",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Numismatics (Coins)
    // =========================================================================

    // Numismatics NER: catalog entry for a coin-collecting NER dataset (English, JSONL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    NumismaticsNER {
        name: "Numismatics NER",
        description: "Coin collecting entities. Denominations, mints, grades, errors.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // "Year" here is an entity label, unrelated to the `year` metadata field below.
        entity_types: ["Denomination", "Mint", "Grade", "Error", "Year", "Metal"],
        language: "en",
        domain: "numismatics",
        license: "CC-BY-4.0",
        // NOTE(review): names organisations rather than an author/year citation -- confirm.
        citation: "PCGS/NGC",
        year: 2021,
        format: "JSONL",
        notes: "Coin collecting; grading and mint terminology",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Philately (Stamps)
    // =========================================================================

    // Philately NER: catalog entry for a stamp-collecting NER dataset (English, JSONL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    PhilatelyNER {
        name: "Philately NER",
        description: "Stamp collecting entities. Issues, perforations, watermarks, varieties.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // "Year" here is an entity label, unrelated to the `year` metadata field below.
        entity_types: ["Issue", "Perforation", "Watermark", "Variety", "Country", "Year"],
        language: "en",
        domain: "philately",
        license: "CC-BY-4.0",
        // NOTE(review): names a catalogue rather than an author/year citation -- confirm.
        citation: "Scott Catalogue",
        year: 2021,
        format: "JSONL",
        notes: "Stamp collecting; philatelic terminology",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Scuba Diving
    // =========================================================================

    // Scuba NER: catalog entry for a recreational-diving NER dataset (English, CoNLL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    ScubaNER {
        name: "Scuba NER",
        description: "Scuba diving entities. Equipment, sites, certifications, marine life.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // Depth / Visibility look like measurement-style labels alongside the
        // named-thing labels.
        entity_types: ["Equipment", "DiveSite", "Certification", "MarineLife", "Depth", "Visibility"],
        language: "en",
        domain: "scuba",
        license: "CC-BY-4.0",
        // NOTE(review): names organisations rather than an author/year citation -- confirm.
        citation: "PADI/SSI",
        year: 2021,
        format: "CoNLL",
        notes: "Recreational diving; dive site and equipment extraction",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Roller Coasters & Theme Parks
    // =========================================================================

    // Theme Park NER: catalog entry for an amusement-park / roller-coaster NER
    // dataset (English, JSONL). paper_url, size_hint, splits, tasks and
    // access_status are not set for this entry.
    ThemeParkNER {
        name: "Theme Park NER",
        description: "Theme park entities. Rides, parks, manufacturers, statistics.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // Height / Speed presumably cover the "statistics" in the description -- confirm.
        entity_types: ["Ride", "Park", "Manufacturer", "Height", "Speed", "Type"],
        language: "en",
        // Same "entertainment" domain value as AnimeMangaNER and TelenovelaNER.
        domain: "entertainment",
        license: "CC-BY-4.0",
        // NOTE(review): an abbreviation rather than an author/year citation -- confirm.
        citation: "RCDB",
        year: 2022,
        format: "JSONL",
        notes: "Amusement park domain; roller coaster specifications",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Origami
    // =========================================================================

    // Origami NER: catalog entry for a paper-folding NER dataset (English, CoNLL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    OrigamiNER {
        name: "Origami NER",
        description: "Origami entities. Folds, bases, models, paper types.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // Adds "Designer" and "Difficulty" beyond the four kinds named in the description.
        entity_types: ["Fold", "Base", "Model", "PaperType", "Designer", "Difficulty"],
        language: "en",
        // Same "crafts" domain value as KnittingNER and WoodworkingNER.
        domain: "crafts",
        license: "CC-BY-4.0",
        citation: "Origami NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Paper folding domain; fold terminology and model names",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Anime & Manga
    // =========================================================================

    // Anime/Manga NER: catalog entry for an anime and manga NER dataset (JSONL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    AnimeMangaNER {
        name: "Anime/Manga NER",
        description: "Anime and manga entities. Titles, characters, studios, genres.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // "Year" here is an entity label, unrelated to the `year` metadata field below.
        entity_types: ["Title", "Character", "Studio", "Genre", "Author", "Year"],
        // "mul" rather than a single language code; the notes mention both
        // romanized and Japanese names.
        language: "mul",
        // Same "entertainment" domain value as ThemeParkNER and TelenovelaNER.
        domain: "entertainment",
        license: "CC-BY-4.0",
        // NOTE(review): names two sites rather than an author/year citation -- confirm.
        citation: "MyAnimeList/AniDB",
        year: 2022,
        format: "JSONL",
        notes: "Japanese animation; includes romanized and Japanese names",
        categories: [ner, multilingual, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Blockchain & Cryptocurrency
    // =========================================================================

    // Crypto NER: catalog entry for a blockchain/cryptocurrency NER dataset
    // (English, CoNLL). paper_url, size_hint, splits, tasks and access_status are
    // not set for this entry.
    CryptoNER {
        name: "Crypto NER",
        description: "Cryptocurrency entities. Tokens, wallets, exchanges, protocols.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Token", "Wallet", "Exchange", "Protocol", "Price", "Address"],
        language: "en",
        domain: "crypto",
        // Free-form "Research" rather than a licence identifier such as the
        // "CC-BY-4.0" used by most neighbouring entries.
        // NOTE(review): no access_status accompanies it -- confirm whether this
        // dataset is actually openly downloadable.
        license: "Research",
        citation: "Crypto NER Team",
        year: 2022,
        format: "CoNLL",
        notes: "Blockchain domain; DeFi and NFT terminology",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Soap Opera / Telenovela
    // =========================================================================

    // Telenovela NER: catalog entry for a Spanish-language soap-opera NER dataset
    // (CoNLL). paper_url, size_hint, splits, tasks and access_status are not set
    // for this entry.
    TelenovelaNER {
        name: "Telenovela NER",
        description: "Spanish-language soap opera entities. Characters, relationships, plots.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // Five labels here; the neighbouring niche-domain entries list six.
        entity_types: ["Character", "Relationship", "PlotPoint", "Actor", "Network"],
        // The only single-language non-"en" value among the niche-domain entries.
        language: "es",
        // Same "entertainment" domain value as ThemeParkNER and AnimeMangaNER.
        domain: "entertainment",
        license: "CC-BY-4.0",
        citation: "Telenovela NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Spanish soap operas; melodrama terminology",
        // NOTE(review): tagged `multilingual` although `language` is the single
        // code "es" -- confirm whether the category means "non-English" here or
        // whether the tag should be dropped.
        categories: [ner, multilingual, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Tarot & Divination
    // =========================================================================

    // Tarot NER: catalog entry for a tarot-reading NER dataset (English, CoNLL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    TarotNER {
        name: "Tarot NER",
        description: "Tarot entities. Cards, spreads, meanings, suits.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        // Adds "Position" and "Reversal" beyond the four kinds named in the description.
        entity_types: ["Card", "Spread", "Meaning", "Suit", "Position", "Reversal"],
        language: "en",
        domain: "divination",
        license: "CC-BY-4.0",
        citation: "Tarot NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Tarot reading; card interpretation vocabulary",
        categories: [ner, arcane_domain],
    },

    // =========================================================================
    // Niche Domains: Beekeeping
    // =========================================================================

    // Beekeeping NER: catalog entry for an apiculture NER dataset (English, CoNLL).
    // paper_url, size_hint, splits, tasks and access_status are not set for this entry.
    BeekeepingNER {
        name: "Beekeeping NER",
        description: "Apiculture entities. Equipment, bee types, diseases, products.",
        // NOTE(review): repository-root link shared verbatim with the neighbouring
        // niche-domain entries -- confirm it actually leads to this dataset.
        url: "https://github.com/juand-r/entity-recognition-datasets",
        entity_types: ["Equipment", "BeeType", "Disease", "Product", "Season", "Technique"],
        language: "en",
        // Filed under the broad "agriculture" domain rather than a
        // beekeeping-specific value.
        domain: "agriculture",
        license: "CC-BY-4.0",
        citation: "Beekeeping NER Team",
        year: 2021,
        format: "CoNLL",
        notes: "Apiculture domain; hive management vocabulary",
        categories: [ner, arcane_domain],
    },
}