WikiGold {
name: "WikiGold",
description: "Wikipedia-based NER (PER, LOC, ORG, MISC). Historically significant as early Wikipedia NER resource.",
url: "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/wikigold.conll.txt",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "wikipedia",
license: "CC-BY-4.0",
citation: "Balasuriya et al. (2009)",
paper_url: "https://aclanthology.org/W09-3302/",
year: 2009,
format: "CoNLL",
annotation_scheme: "IOB2",
size_hint: "~40k tokens, ~3,500 entities",
example: "Japan B-LOC\n's O\nMinister O\nShinzo B-PER\nAbe I-PER\nvisited O\nthe O\nUnited B-LOC\nStates I-LOC\n. O",
splits: ["all"],
tasks: ["ner"],
expected_docs: 145,
categories: [ner],
},
Wnut17 {
name: "WNUT-17",
description: "Social media NER with emerging entities. Created to evaluate models on rare/emerging entities in noisy social text.",
url: "https://raw.githubusercontent.com/leondz/emerging_entities_17/master/emerging.test.annotated",
entity_types: ["person", "location", "corporation", "product", "creative-work", "group"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Derczynski et al. (2017)",
paper_url: "https://aclanthology.org/W17-4418/",
year: 2017,
format: "CoNLL",
annotation_scheme: "BIO",
size_hint: "~65k tokens, 1,000 tweets",
notes: "89% unseen entities in test set - excellent for OOD evaluation; shared task at W-NUT workshop",
tasks: ["ner"],
hf_id: "leondz/wnut_17",
categories: [ner, social_media],
},
MitMovie {
name: "MIT Movie",
description: "Movie domain slot filling NER. Created at MIT SLS for spoken language understanding research.",
url: "https://sls.csail.mit.edu/downloads/movie/engtest.bio",
entity_types: ["Actor", "Director", "Genre", "Title", "Year", "Song", "Character", "Plot", "Rating"],
language: "en",
domain: "entertainment",
license: "Research",
citation: "Liu et al. (2013)",
paper_url: "https://sls.csail.mit.edu/publications/2013/Liu_ASRU_2013.pdf",
year: 2013,
format: "BIO",
annotation_scheme: "BIO",
size_hint: "~12k utterances",
example: "show O\nme O\naction B-Genre\nmovies O\ndirected O\nby O\nsteven B-Director\nspielberg I-Director",
tasks: ["ner", "slot_filling"],
categories: [ner],
},
MitRestaurant {
name: "MIT Restaurant",
description: "Restaurant domain slot filling NER. Part of MIT SLS spoken dialogue systems research.",
url: "https://sls.csail.mit.edu/downloads/restaurant/restauranttest.bio",
entity_types: ["Amenity", "Cuisine", "Dish", "Hours", "Location", "Price", "Rating", "Restaurant_Name"],
language: "en",
domain: "restaurant",
license: "Research",
citation: "Liu et al. (2013)",
paper_url: "https://sls.csail.mit.edu/publications/2013/Liu_ASRU_2013.pdf",
year: 2013,
format: "BIO",
annotation_scheme: "BIO",
size_hint: "~8k utterances",
example: "find O\nitalian B-Cuisine\nrestaurants O\nin O\nboston B-Location\nwith O\noutdoor B-Amenity\nseating I-Amenity",
tasks: ["ner", "slot_filling"],
categories: [ner],
},
CoNLL2003Sample {
name: "CoNLL-2003 Sample",
description: "Classic news NER benchmark from Reuters Corpus. Foundational dataset that established modern NER evaluation standards.",
url: "https://raw.githubusercontent.com/autoih/conll2003/master/CoNLL-2003/eng.testb",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "news",
license: "Research",
citation: "Tjong Kim Sang & De Meulder (2003)",
paper_url: "https://aclanthology.org/W03-0419/",
year: 2003,
format: "CoNLL",
annotation_scheme: "IOB2",
size_hint: "~300k tokens, ~35k entities",
example: "EU B-ORG\nrejects O\nGerman B-MISC\ncall O\nto O\nboycott O\nBritish B-MISC\nlamb O\n. O",
notes: "Known annotation noise; see CleanCoNLL (2023) for one audit/correction pass",
tasks: ["ner"],
categories: [ner],
},
OntoNotesSample {
name: "OntoNotes Sample",
description: "Multi-genre 18-type NER from OntoNotes 5.0. Rich annotation including coreference, parsing, and PropBank.",
url: "https://raw.githubusercontent.com/autoih/conll2003/master/CoNLL-2003/eng.testb",
entity_types: ["PERSON", "ORG", "GPE", "LOC", "DATE", "TIME", "MONEY", "PERCENT", "NORP", "FAC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "QUANTITY", "ORDINAL", "CARDINAL"],
language: "en",
domain: "news",
license: "LDC",
citation: "Weischedel et al. (2013)",
paper_url: "https://catalog.ldc.upenn.edu/LDC2013T19",
year: 2013,
format: "CoNLL",
annotation_scheme: "IOB2",
size_hint: "~1.6M tokens, ~128k entities",
example: "The B-ORG\nEuropean I-ORG\nUnion I-ORG\nannounced O\nMonday B-DATE\nthat O\nthe O\n$ B-MONEY\n10 I-MONEY\nmillion I-MONEY\nwill O\ngo O\nto O\nUkraine B-GPE\n. O",
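notes: "Full corpus requires LDC license; sample for testing; includes 7 genres. CAVEAT: url currently points at the CoNLL-2003 eng.testb file (same as CoNLL-2003 Sample), not OntoNotes data.",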
categories: [ner],
},
MultiNERD {
name: "MultiNERD",
description: "Large multilingual NER covering 10 languages. Created to address scarcity of multilingual fine-grained NER data.",
url: "https://huggingface.co/datasets/Babelscape/multinerd/resolve/main/test/test_en.jsonl",
entity_types: ["PER", "LOC", "ORG", "ANIM", "BIO", "CEL", "DIS", "EVE", "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI"],
language: "en",
domain: "wikipedia",
license: "CC-BY-SA-4.0",
citation: "Tedeschi & Navigli (2022)",
paper_url: "https://aclanthology.org/2022.findings-naacl.60/",
year: 2022,
format: "JSONL",
annotation_scheme: "BIO",
size_hint: "~1M sentences across 10 languages",
example: "Marie Curie (PER) discovered radium at the University of Paris (ORG) in France (LOC).",
tasks: ["ner"],
hf_id: "Babelscape/multinerd",
categories: [ner, multilingual],
},
FewNERD {
name: "Few-NERD",
description: "Fine-grained NER with 66 types in 8 coarse categories. Designed for few-shot learning evaluation.",
url: "https://huggingface.co/datasets/DFKI-SLT/few-nerd",
entity_types: ["person", "location", "organization", "building", "art", "product", "event", "other"],
language: "en",
domain: "wikipedia",
license: "CC-BY-SA-4.0",
citation: "Ding et al. (2021)",
paper_url: "https://aclanthology.org/2021.acl-long.248/",
year: 2021,
format: "TSV",
size_hint: "188k sentences, 66 fine-grained types",
example: "Jensen Huang (person-entrepreneur) founded NVIDIA (organization-company) in Santa Clara (location-city), California.",
notes: "Hierarchical type system; benchmark for few-shot and fine-grained NER. Note: HuggingFace API may return 422 errors; consider downloading locally.",
tasks: ["ner"],
hf_id: "DFKI-SLT/few-nerd",
categories: [ner],
},
CrossNER {
name: "CrossNER",
description: "Cross-domain NER across 5 domains: politics, science, music, literature, AI. Tests domain transfer.",
url: "https://huggingface.co/datasets/DFKI-SLT/cross_ner",
entity_types: ["PER", "ORG", "LOC", "MISC", "Domain-specific"],
language: "en",
domain: "multi-domain",
license: "MIT",
citation: "Liu et al. (2021)",
paper_url: "https://arxiv.org/abs/2012.04373",
year: 2021,
format: "CoNLL",
size_hint: "5 domains, ~10k sentences each",
notes: "Tests cross-domain transfer; domain-specific entity types. Use HuggingFace datasets library to load.",
hf_id: "DFKI-SLT/cross_ner",
hf_config: "politics",
categories: [ner],
},
FabNER {
name: "FabNER",
description: "Manufacturing domain NER. 12 entity types for Industry 4.0 applications.",
url: "https://huggingface.co/datasets/DFKI-SLT/fabner",
entity_types: ["Material", "Process", "Machine", "Product", "Property"],
language: "en",
domain: "manufacturing",
license: "CC-BY-4.0",
citation: "Kumar & Starly (2022)",
paper_url: "https://doi.org/10.1007/s10845-021-01807-x",
year: 2022,
format: "CoNLL",
size_hint: "~14k sentences, 12 entity types",
notes: "Specialized manufacturing/engineering domain; Industry 4.0",
hf_id: "DFKI-SLT/fabner",
categories: [ner],
},
BroadTwitterCorpus {
name: "Broad Twitter Corpus",
description: "Twitter NER across multiple time periods. Tests temporal robustness of NER systems.",
url: "https://huggingface.co/datasets/tner/btc",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Derczynski et al. (2016)",
paper_url: "https://aclanthology.org/C16-1111/",
year: 2016,
format: "BIO",
size_hint: "~9k tweets, stratified by time period",
notes: "Temporal stratification; tests model robustness to language evolution",
hf_id: "tner/btc",
access_status: HuggingFace,
categories: [ner, social_media],
},
WikiNeural {
name: "WikiNeural",
description: "Silver-standard multilingual NER from Wikipedia. 9 languages with automatic annotation.",
url: "https://huggingface.co/datasets/Babelscape/wikineural",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "mul",
domain: "wikipedia",
license: "CC-BY-SA-4.0",
citation: "Tedeschi et al. (2021)",
paper_url: "https://aclanthology.org/2021.findings-emnlp.215/",
year: 2021,
format: "CoNLL",
size_hint: "9 languages, ~100k sentences each",
notes: "Automatically generated silver annotations; useful for pre-training",
hf_id: "Babelscape/wikineural",
hf_config: "en",
categories: [ner, multilingual],
},
PolyglotNER {
name: "Polyglot-NER",
description: "Massively multilingual NER. 40 languages with silver annotations from Wikipedia.",
url: "https://huggingface.co/datasets/rmyeid/polyglot_ner",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "wikipedia",
license: "Research",
citation: "Al-Rfou et al. (2015)",
paper_url: "https://arxiv.org/abs/1410.3791",
year: 2015,
format: "CoNLL",
size_hint: "40 languages, silver annotations",
notes: "Largest language coverage; silver annotations via Wikipedia links",
tasks: ["ner"],
hf_id: "rmyeid/polyglot_ner",
access_status: Public,
categories: [ner, multilingual],
},
UniversalNERBench {
name: "Universal NER",
description: "Cross-lingual NER benchmark spanning 13 diverse languages. Tests zero-shot transfer.",
url: "https://github.com/UniversalNER/uner_code",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "mixed",
license: "CC-BY-4.0",
citation: "Mayhew et al. (2024)",
paper_url: "https://aclanthology.org/2024.naacl-long.243/",
year: 2024,
format: "CoNLL",
size_hint: "13 languages, gold annotations",
notes: "Tests cross-lingual zero-shot transfer; diverse language families",
categories: [ner, multilingual],
},
CoNLL2002 {
name: "CoNLL-2002",
description: "Spanish and Dutch NER from CoNLL 2002 shared task. Multi-language NER benchmark.",
url: "https://huggingface.co/datasets/eriktks/conll2002",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "mul",
domain: "news",
license: "Research",
citation: "Tjong Kim Sang (2002)",
paper_url: "https://aclanthology.org/W02-2024/",
year: 2002,
format: "CoNLL",
annotation_scheme: "BIO",
size_hint: "Spanish + Dutch news articles",
notes: "First multilingual NER shared task; established CoNLL NER format",
tasks: ["ner"],
hf_id: "eriktks/conll2002",
access_status: Public,
categories: [ner, multilingual],
},
TweetNER7 {
name: "TweetNER7",
description: "Twitter NER across 7 entity types. Fine-grained social media NER with temporal annotations.",
url: "https://huggingface.co/datasets/tner/tweetner7",
entity_types: ["person", "location", "corporation", "product", "creative_work", "group", "event"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Ushio et al. (2022)",
paper_url: "https://aclanthology.org/2022.aacl-main.25/",
year: 2022,
format: "JSONL",
size_hint: "~12k tweets",
notes: "Temporal distribution shift; tests robustness to evolving language",
hf_id: "tner/tweetner7",
hf_config: "tweetner7",
categories: [ner, social_media],
},
GoogleRE {
name: "Google-RE",
description: "Google Relation Extraction dataset. Wikipedia sentences with relation annotations.",
url: "https://raw.githubusercontent.com/google-research-datasets/relation-extraction-corpus/master/20130403-place_of_birth.json",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "wikipedia",
license: "CC-BY-4.0",
citation: "Levy et al. (2017)",
paper_url: "https://aclanthology.org/D17-1004/",
year: 2017,
format: "JSON",
size_hint: "~60k relation triples",
notes: "Clean relation extraction; commonly used for zero-shot RE evaluation; using place_of_birth subset",
tasks: ["re"],
access_status: Public,
categories: [relation_extraction],
},
NYTFB {
name: "NYT-FB",
description: "New York Times with Freebase relations. Distant supervision relation extraction.",
url: "https://github.com/thunlp/OpenNRE",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "news",
license: "Research",
citation: "Riedel et al. (2010)",
paper_url: "https://aclanthology.org/N10-1114/",
year: 2010,
format: "JSONL",
size_hint: "~570k sentences, 53 relations",
notes: "Classic distant supervision RE; noisy but large-scale",
categories: [relation_extraction],
},
REBEL {
name: "REBEL",
description: "Relation Extraction By End-to-end Language generation. Large-scale RE dataset.",
url: "https://huggingface.co/datasets/Babelscape/rebel-dataset",
entity_types: ["PER", "LOC", "ORG", "Event"],
language: "en",
domain: "wikipedia",
license: "CC-BY-SA-4.0",
citation: "Huguet Cabot & Navigli (2021)",
paper_url: "https://aclanthology.org/2021.findings-emnlp.204/",
year: 2021,
format: "JSONL",
size_hint: "~6M triples from Wikipedia",
notes: "Large-scale; generative RE approach; 220 relation types",
categories: [relation_extraction],
},
MultiCoNER {
name: "MultiCoNER",
description: "Multilingual Complex NER. 11 languages with fine-grained and complex entities.",
url: "https://huggingface.co/datasets/samanjoy2/multiconer_v1",
entity_types: ["PER", "LOC", "CORP", "GRP", "PROD", "CW"],
language: "mul",
domain: "mixed",
license: "CC-BY-4.0",
citation: "Malmasi et al. (2022)",
paper_url: "https://aclanthology.org/2022.semeval-1.196/",
year: 2022,
format: "CoNLL",
size_hint: "11 languages, ~1.1M tokens",
notes: "SemEval-2022 shared task; complex entities from diverse sources. Community mirror of original.",
hf_id: "samanjoy2/multiconer_v1",
access_status: Public,
categories: [ner, multilingual],
},
MultiCoNERv2 {
name: "MultiCoNER v2",
description: "MultiCoNER v2 with expanded languages and fine-grained types.",
url: "https://huggingface.co/datasets/MultiCoNER/multiconer_v2",
entity_types: ["PER", "LOC", "CORP", "GRP", "PROD", "CW", "Medical", "Scientist"],
language: "mul",
domain: "mixed",
license: "CC-BY-4.0",
citation: "Fetahu et al. (2023)",
paper_url: "https://aclanthology.org/2023.semeval-1.43/",
year: 2023,
format: "CoNLL",
size_hint: "12 languages, fine-grained types",
notes: "SemEval-2023 shared task; expanded from v1 with more types.",
tasks: ["ner"],
hf_id: "MultiCoNER/multiconer_v2",
access_status: Public,
categories: [ner, multilingual],
},
NemotronPII {
name: "Nemotron-PII",
description: "Synthetic PII dataset with 55+ categories spanning personal, financial, healthcare, and demographic data.",
url: "https://huggingface.co/datasets/nvidia/Nemotron-PII",
entity_types: ["first_name", "last_name", "ssn", "email", "phone_number", "credit_debit_card", "street_address", "date_of_birth", "medical_record_number"],
language: "en",
domain: "multi-industry",
license: "CC-BY-4.0",
citation: "NVIDIA (2025)",
paper_url: "https://huggingface.co/datasets/nvidia/Nemotron-PII",
year: 2025,
format: "JSON",
size_hint: "200k records (100k train / 100k test), 55+ PII categories",
notes: "Character-offset span annotations. Covers HIPAA and GDPR entity classes. Synthetic, multi-industry.",
tasks: ["ner"],
hf_id: "nvidia/Nemotron-PII",
access_status: HuggingFace,
categories: [ner],
},
NamedTimexes {
name: "Named Temporal Expressions",
description: "Temporal NER for culturally-named time expressions (Michaelmas, Vasant Panchami) that standard TIMEX3 tools miss.",
url: "https://huggingface.co/datasets/strombergnlp/named_timexes",
entity_types: ["T"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Stromberg Derczynski (2023)",
paper_url: "https://huggingface.co/datasets/strombergnlp/named_timexes",
year: 2023,
format: "IOB2",
annotation_scheme: "IOB2",
size_hint: "~117k tokens (87k train / 30k test)",
notes: "Binary O/T labels. Targets named temporal expressions, not numeric dates. HuggingFace native.",
tasks: ["ner", "temporal"],
hf_id: "strombergnlp/named_timexes",
access_status: HuggingFace,
categories: [ner],
},
CyberThreatIntelligence {
name: "Cyber Threat Intelligence NER",
description: "NER on Palo Alto Networks threat reports. Malware, attack patterns, threat actors, and software entities.",
url: "https://huggingface.co/datasets/mrmoor/cyber-threat-intelligence",
entity_types: ["malware", "attack-pattern", "threat-actor", "identity", "SOFTWARE", "location", "TIME"],
language: "en",
domain: "cybersecurity",
license: "CC-BY-4.0",
citation: "Moor (2024)",
paper_url: "https://huggingface.co/datasets/mrmoor/cyber-threat-intelligence",
year: 2024,
format: "JSON",
size_hint: "9,732 records from real threat reports",
notes: "Character-offset span annotations with relations. Derived from real threat intelligence, not synthetic.",
tasks: ["ner", "re"],
hf_id: "mrmoor/cyber-threat-intelligence",
access_status: HuggingFace,
categories: [ner, relation_extraction],
},