anno-eval 0.10.0

    // =========================================================================
    // Core NER Datasets
    // =========================================================================
    // WikiGold: English Wikipedia NER tagged with PER/LOC/ORG/MISC.
    // `url` is a single raw CoNLL-format file, which lines up with the lone
    // "all" split declared below (no train/dev/test division is listed).
    // `expected_docs: 145` is presumably the document count a loader should
    // observe for that file — confirm against the consumer of this field.
    // `categories` holds bare identifiers, not strings; they are presumably
    // resolved by the enclosing macro (not visible here).
    WikiGold {
        name: "WikiGold",
        description: "Wikipedia-based NER (PER, LOC, ORG, MISC). Historically significant as early Wikipedia NER resource.",
        url: "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/wikigold.conll.txt",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "wikipedia",
        license: "CC-BY-4.0",
        citation: "Balasuriya et al. (2009)",
        paper_url: "https://aclanthology.org/U09-1001/",
        year: 2009,
        format: "CoNLL",
        annotation_scheme: "IOB2",
        size_hint: "~40k tokens, ~3,500 entities",
        example: "Japan B-LOC\n's O\nMinister O\nShinzo B-PER\nAbe I-PER\nvisited O\nthe O\nUnited B-LOC\nStates I-LOC\n. O",
        splits: ["all"],
        tasks: ["ner"],
        expected_docs: 145,
        categories: [ner],
    },
    // WNUT-17: emerging-entity NER on noisy social media text, with six
    // lowercase entity types (note the hyphenated "creative-work").
    // `url` points at the annotated *test* file only
    // (`emerging.test.annotated`); `hf_id` presumably identifies the same
    // dataset on Hugging Face — confirm how the loader chooses between them.
    Wnut17 {
        name: "WNUT-17",
        description: "Social media NER with emerging entities. Created to evaluate models on rare/emerging entities in noisy social text.",
        url: "https://raw.githubusercontent.com/leondz/emerging_entities_17/master/emerging.test.annotated",
        entity_types: ["person", "location", "corporation", "product", "creative-work", "group"],
        language: "en",
        domain: "social_media",
        license: "CC-BY-4.0",
        citation: "Derczynski et al. (2017)",
        paper_url: "https://aclanthology.org/W17-4418/",
        year: 2017,
        format: "CoNLL",
        annotation_scheme: "BIO",
        size_hint: "~65k tokens, 1,000 tweets",
        notes: "89% unseen entities in test set - excellent for OOD evaluation; shared task at W-NUT workshop",
        tasks: ["ner"],
        hf_id: "leondz/wnut_17",
        categories: [ner, social_media],
    },
    // MIT Movie: slot-filling style NER over lowercase movie queries (see
    // `example`). `url` is the `engtest.bio` file; `format` and
    // `annotation_scheme` are both "BIO". The entry is tagged for two tasks,
    // "ner" and "slot_filling". `license` is the free-form value "Research"
    // rather than an SPDX-style identifier.
    MitMovie {
        name: "MIT Movie",
        description: "Movie domain slot filling NER. Created at MIT SLS for spoken language understanding research.",
        url: "https://sls.csail.mit.edu/downloads/movie/engtest.bio",
        entity_types: ["Actor", "Director", "Genre", "Title", "Year", "Song", "Character", "Plot", "Rating"],
        language: "en",
        domain: "entertainment",
        license: "Research",
        citation: "Liu et al. (2013)",
        paper_url: "https://sls.csail.mit.edu/publications/2013/Liu_ASRU_2013.pdf",
        year: 2013,
        format: "BIO",
        annotation_scheme: "BIO",
        size_hint: "~12k utterances",
        example: "show O\nme O\naction B-Genre\nmovies O\ndirected O\nby O\nsteven B-Director\nspielberg I-Director",
        tasks: ["ner", "slot_filling"],
        categories: [ner],
    },
    // MIT Restaurant: slot-filling style NER over lowercase restaurant
    // queries (see `example`). `url` is the `restauranttest.bio` file;
    // `format` and `annotation_scheme` are both "BIO". Shares its citation
    // and `paper_url` with the MitMovie entry. Tagged for "ner" and
    // "slot_filling".
    MitRestaurant {
        name: "MIT Restaurant",
        description: "Restaurant domain slot filling NER. Part of MIT SLS spoken dialogue systems research.",
        url: "https://sls.csail.mit.edu/downloads/restaurant/restauranttest.bio",
        entity_types: ["Amenity", "Cuisine", "Dish", "Hours", "Location", "Price", "Rating", "Restaurant_Name"],
        language: "en",
        domain: "restaurant",
        license: "Research",
        citation: "Liu et al. (2013)",
        paper_url: "https://sls.csail.mit.edu/publications/2013/Liu_ASRU_2013.pdf",
        year: 2013,
        format: "BIO",
        annotation_scheme: "BIO",
        size_hint: "~8k utterances",
        example: "find O\nitalian B-Cuisine\nrestaurants O\nin O\nboston B-Location\nwith O\noutdoor B-Amenity\nseating I-Amenity",
        tasks: ["ner", "slot_filling"],
        categories: [ner],
    },
    // CoNLL-2003 sample: Reuters news NER with PER/LOC/ORG/MISC.
    // `url` is the single `eng.testb` file from a GitHub mirror, hence
    // "Sample" in the name.
    // NOTE(review): `size_hint` quotes ~300k tokens / ~35k entities, which
    // presumably describes the full corpus rather than this one file —
    // confirm before relying on it for sanity checks.
    CoNLL2003Sample {
        name: "CoNLL-2003 Sample",
        description: "Classic news NER benchmark from Reuters Corpus. Foundational dataset that established modern NER evaluation standards.",
        url: "https://raw.githubusercontent.com/autoih/conll2003/master/CoNLL-2003/eng.testb",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Tjong Kim Sang & De Meulder (2003)",
        paper_url: "https://aclanthology.org/W03-0419/",
        year: 2003,
        format: "CoNLL",
        annotation_scheme: "IOB2",
        size_hint: "~300k tokens, ~35k entities",
        example: "EU B-ORG\nrejects O\nGerman B-MISC\ncall O\nto O\nboycott O\nBritish B-MISC\nlamb O\n. O",
        notes: "Known annotation noise; see CleanCoNLL (2023) for one audit/correction pass",
        tasks: ["ner"],
        categories: [ner],
    },
    // OntoNotes sample: declared as 18-type, multi-genre NER.
    // NOTE(review): `url` is character-for-character the same as
    // CoNLL2003Sample's `url` (the CoNLL-2003 `eng.testb` file). Anything
    // fetched from it is therefore CoNLL-2003 data, presumably tagged with
    // the four CoNLL types rather than the 18 listed in `entity_types`.
    // `notes` says the full corpus requires an LDC license, so this may be a
    // deliberate stand-in — confirm, and either point at a real OntoNotes
    // sample or state the substitution in `notes`.
    // Unlike most neighbouring entries, no `tasks` field is given here.
    OntoNotesSample {
        name: "OntoNotes Sample",
        description: "Multi-genre 18-type NER from OntoNotes 5.0. Rich annotation including coreference, parsing, and PropBank.",
        url: "https://raw.githubusercontent.com/autoih/conll2003/master/CoNLL-2003/eng.testb",
        entity_types: ["PERSON", "ORG", "GPE", "LOC", "DATE", "TIME", "MONEY", "PERCENT", "NORP", "FAC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "QUANTITY", "ORDINAL", "CARDINAL"],
        language: "en",
        domain: "news",
        license: "LDC",
        citation: "Weischedel et al. (2013)",
        paper_url: "https://catalog.ldc.upenn.edu/LDC2013T19",
        year: 2013,
        format: "CoNLL",
        annotation_scheme: "IOB2",
        size_hint: "~1.6M tokens, ~128k entities",
        example: "The B-ORG\nEuropean I-ORG\nUnion I-ORG\nannounced O\nMonday B-DATE\nthat O\nthe O\n$ B-MONEY\n10 I-MONEY\nmillion I-MONEY\nwill O\ngo O\nto O\nUkraine B-GPE\n. O",
        notes: "Full corpus requires LDC license; sample for testing; includes 7 genres",
        categories: [ner],
    },
    // MultiNERD: 15-type NER. The description and `size_hint` talk about the
    // whole 10-language resource, but `url` targets `test_en.jsonl`, which is
    // consistent with `language: "en"` — this entry appears to cover the
    // English test file only. `example` is an inline-annotated sentence, not
    // a token-per-line sample like the CoNLL-style entries.
    MultiNERD {
        name: "MultiNERD",
        description: "Large multilingual NER covering 10 languages. Created to address scarcity of multilingual fine-grained NER data.",
        url: "https://huggingface.co/datasets/Babelscape/multinerd/resolve/main/test/test_en.jsonl",
        entity_types: ["PER", "LOC", "ORG", "ANIM", "BIO", "CEL", "DIS", "EVE", "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI"],
        language: "en",
        domain: "wikipedia",
        license: "CC-BY-SA-4.0",
        citation: "Tedeschi & Navigli (2022)",
        paper_url: "https://aclanthology.org/2022.findings-naacl.60/",
        year: 2022,
        format: "JSONL",
        annotation_scheme: "BIO",
        size_hint: "~1M sentences across 10 languages",
        example: "Marie Curie (PER) discovered radium at the University of Paris (ORG) in France (LOC).",
        tasks: ["ner"],
        hf_id: "Babelscape/multinerd",
        categories: [ner, multilingual],
    },

    // Few-NERD: fine-grained NER for few-shot evaluation.
    // `entity_types` lists only the 8 coarse categories; the 66 fine-grained
    // types mentioned in `description` and `size_hint` appear in `example` as
    // "coarse-fine" labels (e.g. "person-entrepreneur").
    // `url` is the Hugging Face dataset page rather than a data file, and
    // `notes` warns that the Hugging Face API may return 422 errors.
    // No `annotation_scheme` is given for this entry.
    FewNERD {
        name: "Few-NERD",
        description: "Fine-grained NER with 66 types in 8 coarse categories. Designed for few-shot learning evaluation.",
        url: "https://huggingface.co/datasets/DFKI-SLT/few-nerd",
        entity_types: ["person", "location", "organization", "building", "art", "product", "event", "other"],
        language: "en",
        domain: "wikipedia",
        license: "CC-BY-SA-4.0",
        citation: "Ding et al. (2021)",
        paper_url: "https://aclanthology.org/2021.acl-long.248/",
        year: 2021,
        format: "TSV",
        size_hint: "188k sentences, 66 fine-grained types",
        example: "Jensen Huang (person-entrepreneur) founded NVIDIA (organization-company) in Santa Clara (location-city), California.",
        notes: "Hierarchical type system; benchmark for few-shot and fine-grained NER. Note: HuggingFace API may return 422 errors; consider downloading locally.",
        tasks: ["ner"],
        hf_id: "DFKI-SLT/few-nerd",
        categories: [ner],
    },

    // CrossNER: cross-domain NER over five domains (politics, science, music,
    // literature, AI). `hf_config: "politics"` selects one of those domains;
    // the other four are not covered by this entry.
    // "Domain-specific" in `entity_types` is a placeholder for the per-domain
    // label sets, not a literal tag.
    // `paper_url` points at the paper's arXiv page: the paper appeared at
    // AAAI 2021, which the ACL Anthology does not host, so the previous
    // aclanthology.org link could not resolve.
    CrossNER {
        name: "CrossNER",
        description: "Cross-domain NER across 5 domains: politics, science, music, literature, AI. Tests domain transfer.",
        url: "https://huggingface.co/datasets/DFKI-SLT/cross_ner",
        entity_types: ["PER", "ORG", "LOC", "MISC", "Domain-specific"],
        language: "en",
        domain: "multi-domain",
        license: "MIT",
        citation: "Liu et al. (2021)",
        paper_url: "https://arxiv.org/abs/2012.04373",
        year: 2021,
        format: "CoNLL",
        size_hint: "5 domains, ~10k sentences each",
        notes: "Tests cross-domain transfer; domain-specific entity types. Use HuggingFace datasets library to load.",
        hf_id: "DFKI-SLT/cross_ner",
        hf_config: "politics",
        categories: [ner],
    },

    // FabNER: manufacturing-domain NER, loaded via `hf_id`.
    // NOTE(review): `description` and `size_hint` both say 12 entity types,
    // but `entity_types` lists only five. Either the list is a coarse subset
    // or it is incomplete — confirm against the dataset's label set before
    // using `entity_types` for label mapping or per-type metrics.
    // `url` is the Hugging Face dataset page rather than a data file.
    FabNER {
        name: "FabNER",
        description: "Manufacturing domain NER. 12 entity types for Industry 4.0 applications.",
        url: "https://huggingface.co/datasets/DFKI-SLT/fabner",
        entity_types: ["Material", "Process", "Machine", "Product", "Property"],
        language: "en",
        domain: "manufacturing",
        license: "CC-BY-4.0",
        citation: "Kumar et al. (2022)",
        paper_url: "https://aclanthology.org/2022.lrec-1.227/",
        year: 2022,
        format: "CoNLL",
        size_hint: "~14k sentences, 12 entity types",
        notes: "Specialized manufacturing/engineering domain; Industry 4.0",
        hf_id: "DFKI-SLT/fabner",
        categories: [ner],
    },

    // Broad Twitter Corpus: Twitter NER stratified across time periods,
    // loaded from the `tner/btc` Hugging Face dataset.
    // The corpus annotates person, location and organisation only, so
    // `entity_types` deliberately has no "MISC": listing a type that never
    // occurs in the gold data would add an always-empty class to any
    // per-type reporting.
    // `access_status` is a bare identifier, presumably an enum variant
    // resolved by the enclosing macro (not visible here).
    BroadTwitterCorpus {
        name: "Broad Twitter Corpus",
        description: "Twitter NER across multiple time periods. Tests temporal robustness of NER systems.",
        url: "https://huggingface.co/datasets/tner/btc",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "social_media",
        license: "CC-BY-4.0",
        citation: "Derczynski et al. (2016)",
        paper_url: "https://aclanthology.org/C16-1111/",
        year: 2016,
        format: "BIO",
        size_hint: "~9k tweets, stratified by time period",
        notes: "Temporal stratification; tests model robustness to language evolution",
        hf_id: "tner/btc",
        access_status: HuggingFace,
        categories: [ner, social_media],
    },

    // WikiNeural: silver-standard (automatically annotated) multilingual NER
    // with PER/LOC/ORG/MISC. `language` is "mul" because the resource spans
    // 9 languages, while `hf_config: "en"` selects a single configuration —
    // presumably the English one; confirm whether other languages are meant
    // to be reachable through this entry.
    WikiNeural {
        name: "WikiNeural",
        description: "Silver-standard multilingual NER from Wikipedia. 9 languages with automatic annotation.",
        url: "https://huggingface.co/datasets/Babelscape/wikineural",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "mul",
        domain: "wikipedia",
        license: "CC-BY-SA-4.0",
        citation: "Tedeschi et al. (2021)",
        paper_url: "https://aclanthology.org/2021.findings-emnlp.215/",
        year: 2021,
        format: "CoNLL",
        size_hint: "9 languages, ~100k sentences each",
        notes: "Automatically generated silver annotations; useful for pre-training",
        hf_id: "Babelscape/wikineural",
        hf_config: "en",
        categories: [ner, multilingual],
    },

    // Polyglot-NER: silver-annotated NER over 40 languages with PER/LOC/ORG.
    // No `hf_config` is given even though `language` is "mul", so which
    // language(s) get loaded is decided elsewhere.
    // NOTE(review): `citation` and `year` say 2015, but the `paper_url`
    // identifier starts with "C14", which looks like a 2014 ACL Anthology ID.
    // Verify that the link actually resolves to the Polyglot-NER paper.
    PolyglotNER {
        name: "Polyglot-NER",
        description: "Massively multilingual NER. 40 languages with silver annotations from Wikipedia.",
        url: "https://huggingface.co/datasets/rmyeid/polyglot_ner",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mul",
        domain: "wikipedia",
        license: "Research",
        citation: "Al-Rfou et al. (2015)",
        paper_url: "https://aclanthology.org/C14-1078/",
        year: 2015,
        format: "CoNLL",
        size_hint: "40 languages, silver annotations",
        notes: "Largest language coverage; silver annotations via Wikipedia links",
        tasks: ["ner"],
        hf_id: "rmyeid/polyglot_ner",
        access_status: Public,
        categories: [ner, multilingual],
    },

    // Universal NER (UNER): gold-standard cross-lingual NER benchmark with
    // PER/LOC/ORG, used here for zero-shot transfer evaluation.
    // The citation is Mayhew et al., NAACL 2024. The previous values
    // ("Malmasi et al. (2022)" plus a 2022 EMNLP link) belonged to a
    // different paper and matched the citation on the MultiCoNER entry.
    // `url` is the project's GitHub repository page, not a data file, and no
    // `hf_id` is given, so this entry cannot be fetched as a single file.
    UniversalNERBench {
        name: "Universal NER",
        description: "Cross-lingual NER benchmark spanning 13 diverse languages. Tests zero-shot transfer.",
        url: "https://github.com/UniversalNER/uner_code",
        entity_types: ["PER", "LOC", "ORG"],
        language: "mul",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "Mayhew et al. (2024)",
        paper_url: "https://aclanthology.org/2024.naacl-long.243/",
        year: 2024,
        format: "CoNLL",
        size_hint: "13 languages, gold annotations",
        notes: "Tests cross-lingual zero-shot transfer; diverse language families",
        categories: [ner, multilingual],
    },

    // CoNLL-2002: Spanish and Dutch news NER with PER/LOC/ORG/MISC.
    // `language` is "mul" because the entry covers both languages; no
    // `hf_config` is set, so the choice between the Spanish and Dutch
    // configurations is made elsewhere — confirm in the loader.
    CoNLL2002 {
        name: "CoNLL-2002",
        description: "Spanish and Dutch NER from CoNLL 2002 shared task. Multi-language NER benchmark.",
        url: "https://huggingface.co/datasets/eriktks/conll2002",
        entity_types: ["PER", "LOC", "ORG", "MISC"],
        language: "mul",
        domain: "news",
        license: "Research",
        citation: "Tjong Kim Sang (2002)",
        paper_url: "https://aclanthology.org/W02-2024/",
        year: 2002,
        format: "CoNLL",
        annotation_scheme: "BIO",
        size_hint: "Spanish + Dutch news articles",
        notes: "First multilingual NER shared task; established CoNLL NER format",
        tasks: ["ner"],
        hf_id: "eriktks/conll2002",
        access_status: Public,
        categories: [ner, multilingual],
    },

    // TweetNER7: Twitter NER with seven lowercase entity types.
    // The label spelling here is "creative_work" (underscore), whereas the
    // Wnut17 entry uses "creative-work" (hyphen); the two are not
    // interchangeable as strings.
    // Loaded via `hf_id` with `hf_config: "tweetner7"`.
    TweetNER7 {
        name: "TweetNER7",
        description: "Twitter NER across 7 entity types. Fine-grained social media NER with temporal annotations.",
        url: "https://huggingface.co/datasets/tner/tweetner7",
        entity_types: ["person", "location", "corporation", "product", "creative_work", "group", "event"],
        language: "en",
        domain: "social_media",
        license: "CC-BY-4.0",
        citation: "Ushio et al. (2022)",
        paper_url: "https://aclanthology.org/2022.findings-emnlp.304/",
        year: 2022,
        format: "JSONL",
        size_hint: "~12k tweets",
        notes: "Temporal distribution shift; tests robustness to evolving language",
        hf_id: "tner/tweetner7",
        hf_config: "tweetner7",
        categories: [ner, social_media],
    },

    // Google-RE: relation-extraction corpus over Wikipedia sentences.
    // This is the first entry in the list whose only task is "re" and whose
    // category is `relation_extraction`; `entity_types` describes the
    // argument types rather than NER labels.
    // `url` fetches a single relation file (place_of_birth), as `notes`
    // states, so the ~60k triples in `size_hint` presumably refer to the
    // whole corpus rather than this file.
    // NOTE(review): the data file name is date-stamped 20130403, while
    // `citation` and `year` say 2017. Verify that the cited paper is the
    // source of this corpus rather than a later work that uses it.
    GoogleRE {
        name: "Google-RE",
        description: "Google Relation Extraction dataset. Wikipedia sentences with relation annotations.",
        url: "https://raw.githubusercontent.com/google-research-datasets/relation-extraction-corpus/master/20130403-place_of_birth.json",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "wikipedia",
        license: "CC-BY-4.0",
        citation: "Levy et al. (2017)",
        paper_url: "https://aclanthology.org/D17-1004/",
        year: 2017,
        format: "JSON",
        size_hint: "~60k relation triples",
        notes: "Clean relation extraction; commonly used for zero-shot RE evaluation; using place_of_birth subset",
        tasks: ["re"],
        access_status: Public,
        categories: [relation_extraction],
    },

    // NYT-FB: distantly supervised relation extraction (New York Times text
    // aligned with Freebase relations).
    // `url` is the OpenNRE GitHub repository page rather than a data file,
    // and there is no `hf_id`, `tasks` or `access_status`, so this entry
    // carries metadata only — how (or whether) it is downloaded is decided
    // elsewhere.
    NYTFB {
        name: "NYT-FB",
        description: "New York Times with Freebase relations. Distant supervision relation extraction.",
        url: "https://github.com/thunlp/OpenNRE",
        entity_types: ["PER", "LOC", "ORG"],
        language: "en",
        domain: "news",
        license: "Research",
        citation: "Riedel et al. (2010)",
        paper_url: "https://aclanthology.org/N10-1114/",
        year: 2010,
        format: "JSONL",
        size_hint: "~570k sentences, 53 relations",
        notes: "Classic distant supervision RE; noisy but large-scale",
        categories: [relation_extraction],
    },

    // REBEL: large-scale relation-extraction dataset built from Wikipedia
    // (~6M triples, 220 relation types per `size_hint` and `notes`).
    // `url` is the Hugging Face dataset page, but no `hf_id` is set, unlike
    // the other Hugging Face-hosted entries — confirm whether the loader
    // needs one. No `tasks` field is given either.
    REBEL {
        name: "REBEL",
        description: "Relation Extraction By End-to-end Language generation. Large-scale RE dataset.",
        url: "https://huggingface.co/datasets/Babelscape/rebel-dataset",
        entity_types: ["PER", "LOC", "ORG", "Event"],
        language: "en",
        domain: "wikipedia",
        license: "CC-BY-SA-4.0",
        citation: "Huguet Cabot & Navigli (2021)",
        paper_url: "https://aclanthology.org/2021.findings-emnlp.204/",
        year: 2021,
        format: "JSONL",
        size_hint: "~6M triples from Wikipedia",
        notes: "Large-scale; generative RE approach; 220 relation types",
        categories: [relation_extraction],
    },

    // MultiCoNER (v1): multilingual complex NER with six types, from the
    // SemEval-2022 shared task. `hf_id` and `url` point at
    // `samanjoy2/multiconer_v1`, which `notes` identifies as a community
    // mirror rather than the original release. No `hf_config` is set, so
    // language selection happens elsewhere.
    MultiCoNER {
        name: "MultiCoNER",
        description: "Multilingual Complex NER. 11 languages with fine-grained and complex entities.",
        url: "https://huggingface.co/datasets/samanjoy2/multiconer_v1",
        entity_types: ["PER", "LOC", "CORP", "GRP", "PROD", "CW"],
        language: "mul",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "Malmasi et al. (2022)",
        paper_url: "https://aclanthology.org/2022.semeval-1.196/",
        year: 2022,
        format: "CoNLL",
        size_hint: "11 languages, ~1.1M tokens",
        notes: "SemEval-2022 shared task; complex entities from diverse sources. Community mirror of original.",
        hf_id: "samanjoy2/multiconer_v1",
        access_status: Public,
        categories: [ner, multilingual],
    },

    // MultiCoNER v2: SemEval-2023 follow-up to MultiCoNER with more
    // languages and finer-grained types. `entity_types` keeps the six v1
    // labels and adds "Medical" and "Scientist"; given the "fine-grained
    // types" wording in `size_hint`, this list is presumably illustrative
    // rather than the complete tag set — confirm before using it for label
    // mapping.
    MultiCoNERv2 {
        name: "MultiCoNER v2",
        description: "MultiCoNER v2 with expanded languages and fine-grained types.",
        url: "https://huggingface.co/datasets/MultiCoNER/multiconer_v2",
        entity_types: ["PER", "LOC", "CORP", "GRP", "PROD", "CW", "Medical", "Scientist"],
        language: "mul",
        domain: "mixed",
        license: "CC-BY-4.0",
        citation: "Fetahu et al. (2023)",
        paper_url: "https://aclanthology.org/2023.semeval-1.43/",
        year: 2023,
        format: "CoNLL",
        size_hint: "12 languages, fine-grained types",
        notes: "SemEval-2023 shared task; expanded from v1 with more types.",
        tasks: ["ner"],
        hf_id: "MultiCoNER/multiconer_v2",
        access_status: Public,
        categories: [ner, multilingual],
    },

    // =========================================================================
    // PII / Privacy NER
    // =========================================================================
    // Nemotron-PII: synthetic PII detection dataset.
    // `entity_types` names nine categories out of the "55+" mentioned in
    // `description` and `size_hint`, so it is a representative subset, not
    // the full label set.
    // Annotations are character-offset spans (per `notes`), not token tags,
    // which is why no `annotation_scheme` is given.
    // `paper_url` repeats the dataset page because no separate paper is
    // cited.
    NemotronPII {
        name: "Nemotron-PII",
        description: "Synthetic PII dataset with 55+ categories spanning personal, financial, healthcare, and demographic data.",
        url: "https://huggingface.co/datasets/nvidia/Nemotron-PII",
        entity_types: ["first_name", "last_name", "ssn", "email", "phone_number", "credit_debit_card", "street_address", "date_of_birth", "medical_record_number"],
        language: "en",
        domain: "multi-industry",
        license: "CC-BY-4.0",
        citation: "NVIDIA (2025)",
        paper_url: "https://huggingface.co/datasets/nvidia/Nemotron-PII",
        year: 2025,
        format: "JSON",
        size_hint: "200k records (100k train / 100k test), 55+ PII categories",
        notes: "Character-offset span annotations. Covers HIPAA and GDPR entity classes. Synthetic, multi-industry.",
        tasks: ["ner"],
        hf_id: "nvidia/Nemotron-PII",
        access_status: HuggingFace,
        categories: [ner],
    },

    // =========================================================================
    // Temporal NER
    // =========================================================================
    // Named Temporal Expressions: NER for named (non-numeric) time
    // expressions. There is a single entity type, "T"; per `notes` the
    // labels are binary O/T.
    // `format` is "IOB2" here, the same value as `annotation_scheme`, whereas
    // other entries use a container format such as "CoNLL" or "JSONL" in
    // `format` — confirm the consumer accepts this value.
    // `tasks` adds "temporal" alongside "ner"; `paper_url` repeats the
    // dataset page.
    NamedTimexes {
        name: "Named Temporal Expressions",
        description: "Temporal NER for culturally-named time expressions (Michaelmas, Vasant Panchami) that standard TIMEX3 tools miss.",
        url: "https://huggingface.co/datasets/strombergnlp/named_timexes",
        entity_types: ["T"],
        language: "en",
        domain: "social_media",
        license: "CC-BY-4.0",
        citation: "Stromberg Derczynski (2023)",
        paper_url: "https://huggingface.co/datasets/strombergnlp/named_timexes",
        year: 2023,
        format: "IOB2",
        annotation_scheme: "IOB2",
        size_hint: "~117k tokens (87k train / 30k test)",
        notes: "Binary O/T labels. Targets named temporal expressions, not numeric dates. HuggingFace native.",
        tasks: ["ner", "temporal"],
        hf_id: "strombergnlp/named_timexes",
        access_status: HuggingFace,
        categories: [ner],
    },

    // =========================================================================
    // Cyber Threat Intelligence NER
    // =========================================================================
    // Cyber Threat Intelligence NER: entities and relations annotated on
    // threat reports, hence both "ner" and "re" in `tasks` and both `ner` and
    // `relation_extraction` in `categories`.
    // `entity_types` mixes casing ("malware" vs "SOFTWARE", "TIME"); these
    // are presumably the dataset's own label spellings and must be matched
    // exactly — confirm before normalising.
    // Annotations are character-offset spans (per `notes`), so no
    // `annotation_scheme` is given. `paper_url` repeats the dataset page.
    CyberThreatIntelligence {
        name: "Cyber Threat Intelligence NER",
        description: "NER on Palo Alto Networks threat reports. Malware, attack patterns, threat actors, and software entities.",
        url: "https://huggingface.co/datasets/mrmoor/cyber-threat-intelligence",
        entity_types: ["malware", "attack-pattern", "threat-actor", "identity", "SOFTWARE", "location", "TIME"],
        language: "en",
        domain: "cybersecurity",
        license: "CC-BY-4.0",
        citation: "Moor (2024)",
        paper_url: "https://huggingface.co/datasets/mrmoor/cyber-threat-intelligence",
        year: 2024,
        format: "JSON",
        size_hint: "9,732 records from real threat reports",
        notes: "Character-offset span annotations with relations. Derived from real threat intelligence, not synthetic.",
        tasks: ["ner", "re"],
        hf_id: "mrmoor/cyber-threat-intelligence",
        access_status: HuggingFace,
        categories: [ner, relation_extraction],
    },