ZELDA {
name: "ZELDA",
description: "Entity disambiguation benchmark. 95k Wikipedia paragraphs, 8 ED datasets unified.",
url: "https://raw.githubusercontent.com/flairNLP/zelda/main/test_data/conll/test_aida-b.conll",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "wikipedia",
license: "MIT",
citation: "Milich & Akbik (2023)",
paper_url: "https://aclanthology.org/2023.eacl-main.151/",
year: 2023,
format: "CoNLL",
size_hint: "95k paragraphs, 825k entities",
notes: "Standardized ED evaluation; Wikipedia KB; no emerging entities; using AIDA-B test subset",
splits: ["test"],
tasks: ["el", "ner"],
access_status: Public,
categories: [ner, entity_linking],
},
TweetNERD {
name: "TweetNERD",
description: "Twitter NER + Entity Linking. End-to-end NERD benchmark spanning 2010-2021.",
url: "https://zenodo.org/records/6617192",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Mishra et al. (2022)",
paper_url: "https://arxiv.org/abs/2210.08129",
year: 2022,
format: "JSONL",
size_hint: "340k+ tweets",
notes: "NeurIPS 2022; temporal drift; NER + EL + end-to-end NERD",
splits: ["train", "dev", "test"],
tasks: ["ner", "el"],
categories: [ner, social_media],
},
AIDACoNLL {
name: "AIDA-CoNLL",
description: "Primary entity linking benchmark linking CoNLL-2003 mentions to Wikipedia. De-facto standard for end-to-end EL evaluation.",
url: "https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "news",
license: "Research",
citation: "Hoffart et al. (2011)",
paper_url: "https://aclanthology.org/D11-1072/",
year: 2011,
format: "CoNLL",
annotation_scheme: "IOB2",
size_hint: "~1,400 docs, ~34k mentions linked to Wikipedia",
notes: "Built on Reuters CoNLL-2003; AIDA-train/A/B splits; foundational EL benchmark; YAGO KB",
splits: ["train", "testa", "testb"],
tasks: ["ner", "el", "entity_linking", "ned"],
categories: [ner, entity_linking],
},
ACE2005 {
name: "ACE 2005",
description: "Automatic Content Extraction 2005. Nested NER + relations + events.",
url: "",
entity_types: ["PER", "ORG", "GPE", "LOC", "FAC", "WEA", "VEH"],
language: "en",
domain: "news",
license: "LDC",
citation: "Walker et al. (2006)",
paper_url: "https://catalog.ldc.upenn.edu/LDC2006T06",
year: 2005,
format: "XML",
annotation_scheme: "Standoff",
size_hint: "~600 documents",
notes: "Gold standard for nested NER; includes Arabic/Chinese; defines modern IE evaluation",
access_status: Registration,
categories: [ner, nested_ner, relation_extraction],
},
NNE {
name: "NNE (Nested Named Entities)",
description: "Large-scale nested NER corpus from Wikipedia/news. Deep nesting up to 6 levels.",
url: "https://github.com/nickyringland/nested_named_entities",
entity_types: ["PER", "LOC", "ORG", "GPE", "NORP", "FAC", "PRODUCT", "EVENT", "WORK", "LAW"],
language: "en",
domain: "news",
license: "CC-BY-4.0",
citation: "Ringland et al. (2019)",
paper_url: "https://aclanthology.org/P19-1510/",
year: 2019,
format: "CoNLL",
size_hint: "~280k tokens, deep nesting",
notes: "ACL 2019; based on ACE/OntoNotes; up to 6 nested levels; stress test for nested NER",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, nested_ner],
},
GENIANested {
name: "GENIA Nested",
description: "Biomedical nested NER from GENIA corpus. Up to 3 levels of nesting.",
url: "https://raw.githubusercontent.com/thecharm/boundary-aware-nested-ner/master/Our_boundary-aware_model/data/genia/genia.test.iob2",
entity_types: ["DNA", "RNA", "PROTEIN", "CELL_LINE", "CELL_TYPE"],
language: "en",
domain: "biomedical",
license: "GENIA Project License",
citation: "Kim et al. (2003)",
paper_url: "https://aclanthology.org/W03-1302/",
year: 2003,
format: "CoNLL",
size_hint: "~2k abstracts",
example: "[[IL-2 receptor] alpha chain] promoter\n[IL-2 receptor]: PROTEIN, [IL-2 receptor alpha chain]: PROTEIN (nested)",
notes: "Canonical biomedical nested NER benchmark; used alongside ACE for nested NER evaluation",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, nested_ner, biomedical],
},
ChineseNestedNER {
name: "Chinese Nested NER",
description: "Chinese nested named entity recognition. Multiple levels of embedded entities.",
url: "https://github.com/LeeSureman/Nested-NER",
entity_types: ["PER", "ORG", "LOC", "GPE"],
language: "zh",
domain: "news",
license: "CC-BY-4.0",
citation: "Wang et al. (2020)",
year: 2020,
format: "JSONL",
size_hint: "~20k sentences",
notes: "Chinese nested NER benchmark; designed for span-based model evaluation; CJK characters",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, nested_ner, multilingual],
},
SCINERNested {
name: "SciNER Nested",
description: "Scientific paper NER with nested annotations. Methods, tasks, and datasets.",
url: "https://github.com/allenai/sciie",
entity_types: ["TASK", "METHOD", "METRIC", "MATERIAL", "GENERIC"],
language: "en",
domain: "scientific",
license: "Apache-2.0",
citation: "Luan et al. (2018)",
paper_url: "https://aclanthology.org/D18-1360/",
year: 2018,
format: "JSONL",
size_hint: "~500 abstracts",
notes: "Scientific information extraction; nested spans common in methodology descriptions",
splits: ["train", "dev", "test"],
tasks: ["ner", "re"],
categories: [ner, nested_ner, arcane_domain],
},
ShAReCLEF {
name: "ShARe/CLEF",
description: "Shared Annotated Resources for clinical NER. ShARe/CLEF eHealth shared task.",
url: "",
entity_types: ["DISORDER", "FINDING", "PROCEDURE"],
language: "en",
domain: "clinical",
license: "PhysioNet",
citation: "Pradhan et al. (2013)",
paper_url: "https://aclanthology.org/S13-2056/",
year: 2013,
format: "BRAT",
annotation_scheme: "Standoff",
size_hint: "~300 clinical notes",
notes: "Discontinuous clinical entities; SNOMED-CT normalization; de-identified records",
access_status: Registration,
categories: [ner, biomedical, discontinuous_ner],
},
GermEvalDiscontinuous {
name: "GermEval Discontinuous",
description: "German discontinuous NER from GermEval 2014. Non-contiguous entity spans.",
url: "https://sites.google.com/site/germeval2014ner/data",
entity_types: ["PER", "ORG", "LOC", "OTH"],
language: "de",
domain: "news",
license: "CC-BY-4.0",
citation: "Benikova et al. (2014)",
paper_url: "https://aclanthology.org/W14-1707/",
year: 2014,
format: "CoNLL",
size_hint: "~87k tokens",
notes: "German discontinuous entities; derived entities; embedded entities",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, discontinuous_ner, multilingual],
},
ADRDiscontinuous {
name: "ADR Discontinuous",
description: "Adverse Drug Reaction corpus with discontinuous mentions. Patient forum posts.",
url: "https://github.com/Aitslab/ADR-DisNER",
entity_types: ["ADR", "DRUG", "SYMPTOM"],
language: "en",
domain: "biomedical",
license: "CC-BY-4.0",
citation: "Metke-Jimenez et al. (2016)",
year: 2016,
format: "BRAT",
size_hint: "~2k posts",
notes: "Social media ADR mentions; many discontinuous spans; health forum text",
categories: [ner, biomedical, discontinuous_ner, social_media],
},
PubMedDiscontinuous {
name: "PubMed Discontinuous",
description: "PubMed abstracts with discontinuous biomedical entities. Complex entity boundaries.",
url: "https://github.com/dmis-lab/discontinuous-ner",
entity_types: ["CHEMICAL", "DISEASE", "GENE"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Dai et al. (2020)",
year: 2020,
format: "CoNLL",
size_hint: "~8k abstracts",
notes: "Scientific abstracts; discontinuous chemical and disease mentions",
categories: [ner, biomedical, discontinuous_ner],
},
TACRED {
name: "TACRED",
description: "TAC Relation Extraction Dataset. 42 relations from TAC KBP.",
url: "",
entity_types: ["PER", "ORG"],
language: "en",
domain: "news",
license: "LDC",
citation: "Zhang et al. (2017)",
paper_url: "https://aclanthology.org/D17-1004/",
year: 2017,
format: "JSONL",
size_hint: "106k examples",
example: "subj: 'Tim Cook', obj: 'Apple', relation: per:employee_of, text: 'Tim Cook is the CEO of Apple Inc.'",
notes: "42 relations; majority no_relation; known label noise; Re-TACRED fixes some issues",
access_status: Registration,
categories: [relation_extraction],
},
SemEval2010Task8 {
name: "SemEval-2010 Task 8",
description: "Semantic relation classification between nominals. 9 relation types.",
url: "https://github.com/sahitya0000/Relation-Classification",
entity_types: ["e1", "e2"],
language: "en",
domain: "mixed",
license: "Research",
citation: "Hendrickx et al. (2010)",
paper_url: "https://aclanthology.org/S10-1006/",
year: 2010,
format: "Custom",
size_hint: "~10k examples",
notes: "Classic RE benchmark; 9 directed relations + OTHER; small but influential",
categories: [relation_extraction],
},
FewRel {
name: "FewRel",
description: "Few-shot relation classification benchmark. 100 relations from Wikidata.",
url: "https://raw.githubusercontent.com/thunlp/FewRel/master/data/val_wiki.json",
entity_types: ["head", "tail"],
language: "en",
domain: "wikipedia",
license: "MIT",
citation: "Han et al. (2018)",
paper_url: "https://aclanthology.org/D18-1514/",
year: 2018,
format: "JSONL",
size_hint: "70k instances, 100 relations",
notes: "N-way K-shot evaluation; Wikidata relations; FewRel 2.0 adds domain adaptation",
hf_id: "few_rel",
categories: [relation_extraction],
},
NYT10 {
name: "NYT-10",
description: "New York Times distant supervision RE. 24 Freebase relations.",
url: "http://iesl.cs.umass.edu/riedel/ecml/",
entity_types: ["PER", "ORG", "LOC"],
language: "en",
domain: "news",
license: "Research",
citation: "Riedel et al. (2010)",
paper_url: "https://aclanthology.org/W10-1001/",
year: 2010,
format: "Custom",
size_hint: "~266k sentences",
notes: "Distant supervision using Freebase alignment; noisy labels; majority no_relation; standard DS-RE benchmark",
splits: ["train", "test"],
tasks: ["re"],
categories: [relation_extraction],
},
JNLPBA {
name: "JNLPBA",
description: "JNLPBA 2004 shared task. Bio-entity recognition in PubMed abstracts.",
url: "https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/JNLPBA/test.tsv",
entity_types: ["PROTEIN", "DNA", "RNA", "CELL_TYPE", "CELL_LINE"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Kim et al. (2004)",
paper_url: "https://aclanthology.org/W04-1213/",
year: 2004,
format: "CoNLL",
annotation_scheme: "IOB2",
size_hint: "~2,400 abstracts",
notes: "Extended GENIA categories; foundational bioNER benchmark",
categories: [ner, biomedical],
},
S800 {
name: "S800",
description: "Species-800 corpus. Species name recognition in biomedical text.",
url: "https://species.jensenlab.org/files/S800-1.0.tar.gz",
entity_types: ["SPECIES"],
language: "en",
domain: "biomedical",
license: "CC-BY-4.0",
citation: "Pafilis et al. (2013)",
paper_url: "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0065390",
year: 2013,
format: "XML",
size_hint: "800 abstracts",
notes: "Species NER; taxonomy normalization; tar.gz archive with XML format; requires manual extraction and conversion to CoNLL",
access_status: DependsOnOther,
categories: [ner, biomedical],
},
TempEval3 {
name: "TempEval-3",
description: "Temporal annotation benchmark. TIMEX, EVENT spans, and temporal relations.",
url: "https://figshare.com/articles/dataset/TempEval-3_data/9586532",
entity_types: ["TIMEX", "EVENT"],
language: "en",
domain: "news",
license: "CC-BY-4.0",
citation: "UzZaman et al. (2013)",
paper_url: "https://aclanthology.org/S13-2001/",
year: 2013,
format: "TimeML",
notes: "Time expression NER + event detection + temporal ordering; TimeBank based; TE3-Platinum gold standard",
splits: ["train", "test"],
tasks: ["ner", "temporal"],
categories: [ner],
},
TimeBank12 {
name: "TimeBank 1.2",
description: "Canonical temporal IE corpus. News articles with TIMEX3, events, and temporal links (TLINKs).",
url: "https://catalog.ldc.upenn.edu/LDC2006T08",
entity_types: ["TIMEX3", "EVENT", "SIGNAL"],
language: "en",
domain: "news",
license: "LDC",
citation: "Pustejovsky et al. (2003)",
paper_url: "https://aclanthology.org/W03-1808/",
year: 2003,
format: "TimeML",
size_hint: "183 news documents, ~9k events",
notes: "Original TimeML corpus; basis for TempEval shared tasks; temporal ordering gold standard",
splits: ["train", "test"],
tasks: ["ner", "temporal", "events"],
categories: [ner],
},
MATRES {
name: "MATRES",
description: "Multi-Axis Temporal Relations. Cleaner, more consistent event-event temporal relation annotations.",
url: "https://github.com/qiangning/MATRES",
entity_types: ["EVENT"],
language: "en",
domain: "news",
license: "Research",
citation: "Ning et al. (2018)",
paper_url: "https://aclanthology.org/P18-1212/",
year: 2018,
format: "Custom",
size_hint: "~13.5k temporal relation pairs",
notes: "Re-annotated TimeBank/AQUAINT subset; higher inter-annotator agreement; verb-centric",
splits: ["train", "dev", "test"],
tasks: ["temporal", "events", "re"],
categories: [ner, relations],
},
THYME {
name: "THYME",
description: "Temporal Histories of Your Medical Events. Clinical temporal IE with events and relations.",
url: "",
entity_types: ["EVENT", "TIMEX3", "SECTIONTIME", "DOCTIME"],
language: "en",
domain: "clinical",
license: "Research",
citation: "Styler et al. (2014)",
paper_url: "https://aclanthology.org/L14-1393/",
year: 2014,
format: "Custom",
size_hint: "~600 clinical notes (colon cancer, brain cancer)",
notes: "THYME guidelines; clinical events, temporal expressions, narrative containers; Clinical TempEval basis",
splits: ["train", "dev", "test"],
tasks: ["ner", "temporal", "events"],
access_status: Registration,
categories: [ner, clinical, biomedical],
},
I2B2Temporal {
name: "i2b2 2012 Temporal",
description: "Clinical temporal relations challenge. Events, TIMEX3, and TLINKs in discharge summaries.",
url: "",
entity_types: ["EVENT", "TIMEX3"],
language: "en",
domain: "clinical",
license: "Research",
citation: "Sun et al. (2013)",
paper_url: "https://aclanthology.org/S13-2035/",
year: 2012,
format: "Custom",
size_hint: "~310 clinical notes",
notes: "i2b2 2012 challenge; requires DUA; clinical temporal relation extraction benchmark",
splits: ["train", "test"],
tasks: ["ner", "temporal", "re"],
access_status: Registration,
categories: [ner, clinical, biomedical, relations],
},
Twitter2015MNER {
name: "Twitter-2015 MNER",
description: "Multimodal NER on Twitter. Text + image for entity recognition.",
url: "https://github.com/jefferyYu/UMT",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "social_media",
license: "Research",
citation: "Zhang et al. (2018)",
paper_url: "https://aclanthology.org/N18-1078/",
year: 2018,
format: "CoNLL",
size_hint: "~8,000 tweets with images",
notes: "Multimodal; images via Google Drive archive; UMT preprocessing; first MNER dataset; visual context aids entity recognition",
splits: ["train", "dev", "test"],
tasks: ["ner", "mner"],
categories: [ner, social_media, multimodal],
},
DistantListeningCorpus {
name: "Distant Listening Corpus",
description: "1,283 musical scores with harmonic annotations. String quartet + piano music with Roman numeral analysis.",
url: "https://zenodo.org/records/15150283",
entity_types: ["CHORD", "KEY", "MODULATION", "CADENCE", "PHRASE"],
language: "mul",
domain: "music",
license: "CC-BY-4.0",
citation: "Devaney et al. (2024)",
paper_url: "https://doi.org/10.5281/zenodo.15150283",
year: 2024,
format: "TSV",
size_hint: "1,283 scores, 190k+ annotations",
notes: "Music theory annotation corpus; Roman numeral analysis; supports harmonic sequence extraction; Zenodo archive",
splits: ["train"],
tasks: ["sequence_labeling", "harmonic_analysis"],
categories: [ner, arcane_domain],
},
PIIMasking200k {
name: "PII Masking 200k",
description: "200k synthetic examples for PII detection and masking. Covers 50+ PII types.",
url: "https://huggingface.co/datasets/ai4privacy/pii-masking-200k",
entity_types: ["EMAIL", "PHONE", "SSN", "ADDRESS", "NAME", "DOB", "CREDIT_CARD", "PASSPORT", "IP_ADDRESS", "LICENSE"],
language: "mul",
domain: "privacy",
license: "Apache-2.0",
citation: "AI4Privacy (2024)",
year: 2024,
format: "JSONL",
size_hint: "~200k examples",
notes: "Synthetic PII dataset; multi-language; 50+ entity types; useful for privacy compliance testing",
splits: ["train"],
tasks: ["ner", "pii_detection"],
hf_id: "ai4privacy/pii-masking-200k",
categories: [ner],
},
ENERSec {
name: "E-NER SEC",
description: "Legal NER from SEC EDGAR filings. 52 documents with financial entity annotations.",
url: "https://github.com/jnishii/E-NER",
entity_types: ["ORG", "LOC", "DATE", "MONEY", "PERCENT", "PERSON", "PRODUCT", "CARDINAL"],
language: "en",
domain: "legal",
license: "MIT",
citation: "Nishii et al. (2023)",
year: 2023,
format: "CSV",
size_hint: "52 documents, ~400k tokens",
notes: "SEC 10-K and 10-Q filings; financial regulatory domain; legal entity extraction",
splits: ["train", "test"],
tasks: ["ner"],
categories: [ner, arcane_domain],
},
MSNBCEL {
name: "MSNBC",
description: "Small news article entity linking dataset. Commonly used for out-of-domain EL evaluation.",
url: "",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "news",
license: "Research",
citation: "Cucerzan (2007)",
paper_url: "https://aclanthology.org/D07-1074/",
year: 2007,
format: "Custom",
size_hint: "~20 docs, ~700 mentions",
notes: "Early EL benchmark; often used as OOD test set alongside AIDA",
splits: ["test"],
tasks: ["el", "entity_linking", "ned"],
access_status: ContactAuthors,
categories: [entity_linking],
},
AQUAINT {
name: "AQUAINT",
description: "Newswire entity linking dataset from AQUAINT corpus. Wikipedia-linked mentions.",
url: "",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "news",
license: "LDC",
citation: "Milne & Witten (2008)",
year: 2008,
format: "Custom",
size_hint: "~50 docs, ~700 mentions",
notes: "Commonly paired with AIDA for comprehensive EL evaluation",
splits: ["test"],
tasks: ["el", "entity_linking", "ned"],
access_status: Registration,
categories: [entity_linking],
},
KORE50 {
name: "KORE50",
description: "Short, highly ambiguous entity linking snippets. Tests disambiguation difficulty.",
url: "https://github.com/KORE50/KORE50-NIF-NER",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "evaluation",
license: "CC-BY-4.0",
citation: "Hoffart et al. (2012)",
paper_url: "https://aclanthology.org/P12-1084/",
year: 2012,
format: "Custom",
size_hint: "50 sentences, 144 mentions",
notes: "Highly ambiguous mentions; stress-tests disambiguation ability; includes YAGO types",
splits: ["test"],
tasks: ["el", "entity_linking", "ned"],
categories: [entity_linking, adversarial],
},
WNEDWiki {
name: "WNED-WIKI",
description: "Large-scale Wikipedia entity linking dataset extracted from Wikipedia hyperlinks.",
url: "https://github.com/wikipedia2vec/wikipedia2vec",
entity_types: ["ENTITY"],
language: "en",
domain: "wikipedia",
license: "Research",
citation: "Guo & Barbosa (2018)",
year: 2018,
format: "Custom",
size_hint: "~6M mentions",
notes: "Large-scale silver annotations from Wikipedia hyperlinks",
splits: ["test"],
tasks: ["el", "entity_linking"],
categories: [entity_linking],
},
WNEDClueweb {
name: "WNED-ClueWeb",
description: "Web-scale entity linking from ClueWeb corpus. Tests EL on noisy web text.",
url: "",
entity_types: ["ENTITY"],
language: "en",
domain: "web",
license: "Research",
citation: "Guo & Barbosa (2018)",
year: 2018,
format: "Custom",
size_hint: "~10k docs",
notes: "Web-scale EL benchmark; tests robustness on noisy web text",
splits: ["test"],
tasks: ["el", "entity_linking"],
access_status: Registration,
categories: [entity_linking],
},
BELB {
name: "BELB",
description: "Biomedical Entity Linking Benchmark unifying 11 corpora across 7 knowledge bases. Standardized biomedical EL evaluation.",
url: "https://github.com/sg-wbi/belb",
entity_types: ["Disease", "Chemical", "Gene", "Species", "CellLine", "Variant"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Garda et al. (2023)",
paper_url: "https://academic.oup.com/bioinformatics/article/39/11/btad698/7425450",
year: 2023,
format: "JSONL",
size_hint: "11 corpora, 7 KBs",
notes: "Unifies BC5CDR-Chemical, BC5CDR-Disease, NCBI-Disease, BC2GN, NLM-Gene, Linnaeus, S800, GNORMPLUS, MedMentions, and more",
splits: ["train", "dev", "test"],
tasks: ["el", "entity_linking", "ned"],
categories: [entity_linking, biomedical],
},
MELO {
name: "MELO",
description: "Multilingual Entity Linking of Occupations. 48 datasets across 21 languages for occupation EL.",
url: "https://github.com/avature/melo-benchmark",
entity_types: ["OCCUPATION"],
language: "mul",
domain: "general",
license: "Apache-2.0",
citation: "Retyk et al. (2024)",
paper_url: "https://aclanthology.org/2024.lrec-main.889/",
year: 2024,
format: "JSONL",
size_hint: "48 datasets, 21 languages",
notes: "Zero-shot multilingual EL; includes sentence encoders and lexical baselines",
splits: ["test"],
tasks: ["el", "entity_linking"],
categories: [entity_linking, multilingual],
},
MewsliX {
name: "Mewsli-X",
description: "Multilingual entity linking across 50 languages. Wikipedia-linked mentions for zero-shot cross-lingual EL.",
url: "https://huggingface.co/datasets/izhx/mewsli-x",
entity_types: ["ENTITY"],
language: "mul",
domain: "news",
license: "Apache-2.0",
citation: "Botha et al. (2020)",
paper_url: "https://arxiv.org/abs/2010.11856",
year: 2020,
format: "TSV",
size_hint: "~300k mentions across 50 languages",
notes: "Zero-shot cross-lingual EL benchmark; from WikiNews; Wikipedia KB",
splits: ["test"],
tasks: ["el", "entity_linking", "ned"],
access_status: Public,
categories: [entity_linking, multilingual],
},
BookCorefBamman {
name: "BookCoref (Bamman)",
description: "Full-novel coreference with automatic silver and manual gold annotations. Includes Animal Farm, Siddhartha, Pride and Prejudice.",
url: "https://huggingface.co/datasets/spacemanidol/BookCoref",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "literature",
license: "Research",
citation: "Bamman et al. (2025)",
paper_url: "https://arxiv.org/abs/2507.12075",
year: 2025,
format: "JSONL",
size_hint: "~200k tokens per document",
notes: "Long-document coref benchmark; tests models on full novels; silver + gold annotations. HF-hosted but appears gated in practice; treat as manual unless you have explicit access.",
splits: ["test"],
tasks: ["coref"],
hf_id: "spacemanidol/BookCoref",
access_status: ContactAuthors,
categories: [coref, literary, long_document],
},
NovelCR {
name: "NovelCR",
description: "Large-scale bilingual (EN/ZH) novel coreference. 148k EN mentions, 311k ZH mentions with 74-83% spanning 3+ sentences.",
url: "https://github.com/NovelCR/NovelCR",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "literature",
license: "Research",
citation: "Chen et al. (2024)",
paper_url: "https://openreview.net/forum?id=zuZXwj9aSE",
year: 2024,
format: "JSONL",
size_hint: "EN: 148k mentions, ZH: 311k mentions",
notes: "Long-span coreference; bilingual EN/ZH; most coreferences span multiple sentences",
splits: ["train", "dev", "test"],
tasks: ["coref"],
categories: [coref, literary, long_document, multilingual],
},
AgCNER {
name: "AgCNER",
description: "Large-scale Chinese agricultural NER. 66k samples, ~207k entities, 3.9M characters.",
url: "https://springernature.figshare.com/collections/AgCNER_the_First_Large-Scale_Chinese_Named_Entity_Recognition_Dataset_for_Agricultural_Diseases_and_Pests/6807873",
entity_types: ["CROP", "DISEASE", "PEST", "CHEMICAL", "VARIETY", "LOCATION", "TIME"],
language: "zh",
domain: "scientific",
license: "CC-BY-4.0",
citation: "AgCNER Team (2024)",
paper_url: "https://www.nature.com/articles/s41597-024-03578-5",
year: 2024,
format: "JSONL",
size_hint: "66k samples, ~207k entities, 3.9M characters",
notes: "Nature Scientific Data 2024; 13 entity types; long agricultural case reports; domain NER",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, long_document, multilingual, arcane_domain],
},
ScrollsQMSum {
name: "SCROLLS QMSum",
description: "Long-document QA from SCROLLS benchmark. Query-focused meeting summarization.",
url: "https://github.com/tau-nlp/scrolls",
entity_types: [],
language: "en",
domain: "dialogue",
license: "MIT",
citation: "Shaham et al. (2022)",
paper_url: "https://aclanthology.org/2022.emnlp-main.823/",
year: 2022,
format: "JSONL",
size_hint: "~1.5k meeting transcripts, avg 10k tokens",
notes: "EMNLP 2022; SCROLLS benchmark subset; long meeting transcripts; tests long-context understanding",
splits: ["train", "dev", "test"],
tasks: ["qa"],
categories: [long_document, dialogue],
},
LongDocNER {
name: "Long Document NER",
description: "Long-document NER benchmark. Tests entity recognition across extended contexts.",
url: "https://github.com/xhuang28/LongDocNER",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "mixed",
license: "MIT",
citation: "Huang et al. (2024)",
year: 2024,
format: "JSONL",
size_hint: "~500 documents, avg 8k tokens",
notes: "Tests long-context NER models; entity consistency across document boundaries",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, long_document],
},
BookSumCoref {
name: "BookSum Coref",
description: "Coreference annotations on book chapters from BookSum. Long literary texts.",
url: "https://github.com/salesforce/booksum",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "literature",
license: "Research",
citation: "Kryscinski et al. (2022)",
paper_url: "https://aclanthology.org/2022.findings-emnlp.438/",
year: 2022,
format: "JSONL",
size_hint: "~400 chapters, avg 5k tokens",
notes: "Book chapters with coref chains; tests long-span coreference resolution",
splits: ["train", "test"],
tasks: ["coref"],
categories: [coref, long_document, literary],
},
MultiBioNERLong {
name: "Multi-Bio Long NER",
description: "Long biomedical document NER. Full-text articles vs abstracts.",
url: "https://github.com/dmis-lab/multi-bio-ner",
entity_types: ["GENE", "CHEMICAL", "DISEASE", "SPECIES"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Lee et al. (2023)",
year: 2023,
format: "JSONL",
size_hint: "~1k full-text articles",
notes: "Full-text vs abstract NER comparison; tests biomedical long-context models",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, long_document, biomedical],
},
RadCoref {
name: "RadCoref",
description: "Radiology report coreference from MIMIC-CXR. Clinical domain long-document coref.",
url: "https://physionet.org/content/rad-coreference-resolution/",
entity_types: ["ANATOMY", "OBSERVATION", "FINDING"],
language: "en",
domain: "clinical",
license: "PhysioNet",
citation: "Zhu et al. (2024)",
paper_url: "https://physionet.org/content/rad-coreference-resolution/",
year: 2024,
format: "BRAT",
size_hint: "~500 radiology reports",
notes: "Clinical coref on MIMIC-CXR; requires PhysioNet credentialing; radiology-specific entities",
splits: ["train", "test"],
tasks: ["coref"],
categories: [coref, clinical, biomedical],
},
MEANTIME {
name: "MEANTIME",
description: "Multilingual news corpus with within- and cross-document event coreference. 4 languages.",
url: "https://github.com/newsreader/meantime",
entity_types: ["EVENT", "TIMEX", "PARTICIPANT", "LOCATION"],
language: "mul",
domain: "news",
license: "CC-BY-4.0",
citation: "Minard et al. (2016)",
paper_url: "https://aclanthology.org/L16-1699/",
year: 2016,
format: "Custom",
size_hint: "120 documents, 4 languages (EN, ES, IT, NL)",
notes: "Multilingual CDEC; parallel annotations across languages; NewsReader project",
splits: ["all"],
tasks: ["coref", "event_coref", "cdcr"],
categories: [coref, event_coref, multilingual],
},
FCCT {
name: "FCC-T",
description: "Football Coreference Corpus with token-level annotations. Cross-document event coref in sports news.",
url: "https://github.com/cltl/FCC",
entity_types: ["EVENT", "PARTICIPANT", "TIME", "LOCATION"],
language: "en",
domain: "sports",
license: "CC-BY-4.0",
citation: "Bugert et al. (2021)",
paper_url: "https://direct.mit.edu/coli/article/47/3/575/102774",
year: 2021,
format: "CoNLL",
size_hint: "~300 docs",
notes: "Token-level CDEC; compatible with ECB+ and GVC; sports domain temporal reasoning",
splits: ["train", "dev", "test"],
tasks: ["coref", "event_coref", "cdcr"],
categories: [coref, event_coref],
},
LEMONADE {
name: "LEMONADE",
description: "Large-scale multilingual conflict event corpus. 39k events across 20 languages for CDEC search.",
url: "https://github.com/lemonade-coref/lemonade",
entity_types: ["EVENT", "PARTICIPANT", "LOCATION", "TIME"],
language: "mul",
domain: "news",
license: "Research",
citation: "Eirew et al. (2025)",
year: 2025,
format: "JSONL",
size_hint: "~39k events, 20 languages, 171 countries",
notes: "Conflict event CDEC; cross-document event coreference search task; multilingual",
splits: ["test"],
tasks: ["coref", "event_coref", "cdcr"],
categories: [coref, event_coref, multilingual],
},
BioRED {
name: "BioRED",
description: "Document-level biomedical RE with novelty labels. BioCreative VIII shared task benchmark.",
url: "https://ftp.ncbi.nlm.nih.gov/pub/lu/BioRED/",
entity_types: ["Gene", "Disease", "Chemical", "Species", "Variant", "CellLine"],
language: "en",
domain: "biomedical",
license: "Public",
citation: "Luo et al. (2022)",
paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/baae069/7729400",
year: 2022,
format: "Custom",
size_hint: "600 PubMed abstracts, 8 relation types",
notes: "Document-level RE with novelty detection; distinguishes novel vs known relations",
splits: ["train", "dev", "test"],
tasks: ["ner", "re", "relation_extraction"],
categories: [ner, relation_extraction, biomedical],
},
MedMentions {
name: "MedMentions",
description: "Large-scale biomedical concept mentions mapped to UMLS. PubMed abstracts with fine-grained semantic types.",
url: "https://github.com/chanzuckerberg/MedMentions",
entity_types: ["UMLS_CONCEPT"],
language: "en",
domain: "biomedical",
license: "CC0-1.0",
citation: "Mohan & Li (2019)",
paper_url: "https://arxiv.org/abs/1902.09476",
year: 2019,
format: "Custom",
size_hint: "4,392 abstracts, 352k mentions, 35k concepts",
notes: "UMLS concept linking; 127 semantic types; large-scale biomedical concept NER/EL",
splits: ["train", "dev", "test"],
tasks: ["ner", "el", "entity_linking"],
categories: [ner, entity_linking, biomedical],
},
EnzChemRED {
name: "EnzChemRED",
description: "Enzyme chemistry relation extraction. Links enzymes, substrates, products, cofactors from biochemical literature.",
url: "https://github.com/ncbi-nlp/EnzChemRED",
entity_types: ["Enzyme", "Substrate", "Product", "Cofactor", "Reaction"],
language: "en",
domain: "biomedical",
license: "CC-BY-4.0",
citation: "Schröder et al. (2024)",
paper_url: "https://www.nature.com/articles/s41597-024-03835-7",
year: 2024,
format: "JSONL",
size_hint: "~5k relation triplets",
notes: "Specialized enzyme chemistry RE; biochemical reaction extraction",
splits: ["train", "test"],
tasks: ["ner", "re", "relation_extraction"],
categories: [ner, relation_extraction, biomedical],
},
NCERB {
name: "NCERB",
description: "Named Clinical Entity Recognition Benchmark. Multi-dataset clinical NER evaluation suite.",
url: "https://github.com/NCERB/NCERB",
entity_types: ["Problem", "Treatment", "Test", "Medication", "Anatomy"],
language: "en",
domain: "clinical",
license: "Research",
citation: "Zhou et al. (2024)",
paper_url: "https://arxiv.org/abs/2410.05046",
year: 2024,
format: "Custom",
size_hint: "Multiple clinical corpora aggregated",
notes: "Benchmark suite for clinical NER; evaluates LMs on healthcare entities; aggregates i2b2, n2c2, etc.",
splits: ["test"],
tasks: ["ner"],
categories: [ner, clinical, biomedical],
},
MACCROBAT {
name: "MACCROBAT",
description: "Biomedical NER corpus with extensive coverage. Used with RoBERTa-WWM and deep models.",
url: "https://figshare.com/articles/dataset/MACCROBAT2018/9764942",
entity_types: ["Disease", "Chemical", "Gene", "Species"],
language: "en",
domain: "biomedical",
license: "CC-BY-4.0",
citation: "Islamaj et al. (2019)",
year: 2019,
format: "Custom",
size_hint: "~400 abstracts",
notes: "Multi-type biomedical NER; chemical and disease mentions",
splits: ["train", "test"],
tasks: ["ner"],
categories: [ner, biomedical],
},
ACE05RE {
name: "ACE 2005 RE",
description: "ACE 2005 relation extraction component. 7 entity types, 6 relation types with subtypes.",
url: "",
entity_types: ["PER", "ORG", "GPE", "LOC", "FAC", "VEH", "WEA"],
language: "en",
domain: "news",
license: "LDC",
citation: "Walker et al. (2006)",
year: 2005,
format: "XML",
size_hint: "~600 docs, 6 relation types",
notes: "Classic RE benchmark; requires LDC license; often used with ACE NER",
splits: ["train", "dev", "test"],
tasks: ["ner", "re", "relation_extraction"],
access_status: Registration,
categories: [ner, relation_extraction],
},
CoNLL04RE {
name: "CoNLL04 RE",
description: "Sentence-level relation extraction from CoNLL-2004. Clean, small RE benchmark.",
url: "https://github.com/bekou/multihead_joint_entity_relation_extraction",
entity_types: ["PER", "ORG", "LOC", "Other"],
language: "en",
domain: "news",
license: "Research",
citation: "Roth & Yih (2004)",
paper_url: "https://aclanthology.org/W04-2401/",
year: 2004,
format: "CoNLL",
size_hint: "~1.4k sentences, 5 relation types",
notes: "Clean sentence-level RE; joint NER+RE evaluation",
splits: ["train", "test"],
tasks: ["ner", "re", "relation_extraction"],
categories: [ner, relation_extraction],
},
CrossRE {
name: "CrossRE",
description: "Cross-domain relation extraction across 6 domains. Tests RE generalization.",
url: "https://github.com/mainlp/CrossRE",
entity_types: ["PER", "ORG", "LOC", "MISC"],
language: "en",
domain: "cross_domain",
license: "CC-BY-4.0",
citation: "Bassignana & Plank (2022)",
paper_url: "https://aclanthology.org/2022.emnlp-main.452/",
year: 2022,
format: "JSON",
size_hint: "6 domains: AI, Literature, Music, News, Politics, Science",
notes: "Cross-domain RE evaluation; tests transfer across domains",
splits: ["train", "dev", "test"],
tasks: ["re", "relation_extraction"],
categories: [relation_extraction],
},
UNER {
name: "UNER",
description: "Universal NER on Universal Dependencies. Gold NER with unified schema across 13 languages.",
url: "https://github.com/UniversalNER/UNER",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "general",
license: "CC-BY-SA-4.0",
citation: "Mayhew et al. (2024)",
paper_url: "https://aclanthology.org/2024.naacl-long.243/",
year: 2024,
format: "CoNLLU",
size_hint: "13 languages including Cebuano, Tagalog, Narabizi",
notes: "Unified NER on UD treebanks; includes low-resource languages; community-driven expansion",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, multilingual, low_resource],
},
IndicNER {
name: "IndicNER",
description: "Indian languages NER covering 11 Indian languages. Low-resource multilingual NER.",
url: "https://github.com/AI4Bharat/IndicNER",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "general",
license: "CC-BY-4.0",
citation: "Mhaske et al. (2022)",
paper_url: "https://aclanthology.org/2022.findings-acl.269/",
year: 2022,
format: "CoNLL",
size_hint: "11 languages: Hindi, Bengali, Telugu, Tamil, Marathi, etc.",
notes: "Indian language NER; part of AI4Bharat initiative; low-resource focus",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, multilingual, low_resource],
},
NorNE {
name: "NorNE",
description: "Norwegian NER covering Bokmål and Nynorsk. Morphologically rich language from news and parliament text.",
url: "https://github.com/ltgoslo/norne",
entity_types: ["PER", "LOC", "ORG", "GPE", "PROD", "EVT", "DRV"],
language: "no",
domain: "news",
license: "CC-BY-4.0",
citation: "Jørgensen et al. (2020)",
paper_url: "https://aclanthology.org/2020.lrec-1.559/",
year: 2020,
format: "CoNLL",
size_hint: "~600k tokens, both Bokmål and Nynorsk",
notes: "Both Norwegian written forms; morphologically rich; 8 entity types (GPE is annotated as GPE_LOC and GPE_ORG)",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner],
},
GermEval2014 {
name: "GermEval 2014",
description: "German NER shared task. Standard German NER benchmark with nested entities.",
url: "https://sites.google.com/site/germeval2014ner/data",
entity_types: ["PER", "LOC", "ORG", "OTH"],
language: "de",
domain: "news",
license: "CC-BY-4.0",
citation: "Benikova et al. (2014)",
paper_url: "https://aclanthology.org/W14-1707/",
year: 2014,
format: "CoNLL",
annotation_scheme: "BIO",
size_hint: "~31k sentences",
notes: "Standard German NER; includes nested/embedded entities; derived from Wikipedia and news",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, nested_ner],
},
ReasoningNER {
name: "ReasoningNER",
description: "Zero-shot NER evaluation suite across 20 diverse datasets. Tests LLM NER capabilities.",
url: "https://github.com/reasoning-ner/reasoning-ner",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "evaluation",
license: "Research",
citation: "Xia et al. (2025)",
paper_url: "https://arxiv.org/abs/2511.11978",
year: 2025,
format: "JSONL",
size_hint: "20 datasets across news, social, biomedical, etc.",
notes: "Zero-shot NER evaluation; tests instruction-following and entity reasoning in LLMs",
splits: ["test"],
tasks: ["ner"],
categories: [ner, adversarial],
},
BioNERLLaMA {
name: "BioNER-LLaMA",
description: "Instruction-tuned biomedical NER benchmark. Evaluates generative models on disease/chemical/gene NER.",
url: "https://github.com/BIDS-Xu-Lab/BioNER-LLaMA",
entity_types: ["Disease", "Chemical", "Gene"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Keloth et al. (2024)",
paper_url: "https://academic.oup.com/bioinformatics/article/40/4/btae163/7633405",
year: 2024,
format: "JSONL",
size_hint: "Instruction-formatted from BC5CDR, NCBI, etc.",
notes: "LLM instruction-tuning for BioNER; evaluates ChatGPT, LLaMA, etc. on biomedical entities",
splits: ["test"],
tasks: ["ner"],
categories: [ner, biomedical],
},
MentionResolutionLLM {
name: "Mention Resolution LLM",
description: "MCQ-format coreference for LLMs from LitBank and FantasyCoref. Tests referential understanding on narratives.",
url: "https://github.com/mention-resolution/mention-resolution-llm",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "literature",
license: "Research",
citation: "Adams et al. (2024)",
paper_url: "https://arxiv.org/abs/2411.07466",
year: 2024,
format: "JSONL",
size_hint: "MCQ from LitBank + FantasyCoref",
notes: "Multiple-choice coref for LLM evaluation; tests ambiguous, long-distance, nested mentions",
splits: ["test"],
tasks: ["coref"],
categories: [coref, literary],
},
ShARe2013 {
name: "ShARe 2013",
description: "Clinical disorder mentions from ShARe/CLEF eHealth 2013. Discontinuous entity annotations.",
url: "",
entity_types: ["DISORDER"],
language: "en",
domain: "clinical",
license: "Research",
citation: "Pradhan et al. (2013)",
paper_url: "https://aclanthology.org/S13-2056/",
year: 2013,
format: "Custom",
size_hint: "~300 clinical notes",
notes: "Clinical NER with discontinuous spans; shared task at CLEF eHealth",
splits: ["train", "dev", "test"],
tasks: ["ner", "discontinuous-ner"],
access_status: Registration,
categories: [ner, discontinuous_ner, clinical, biomedical],
},
ShARe2014 {
name: "ShARe 2014",
description: "Clinical disorder mentions from ShARe/CLEF eHealth 2014. Improved discontinuous NER annotations.",
url: "",
entity_types: ["DISORDER", "ANATOMY", "MODIFIER"],
language: "en",
domain: "clinical",
license: "Research",
citation: "Mowery et al. (2014)",
paper_url: "https://aclanthology.org/S14-2007/",
year: 2014,
format: "Custom",
size_hint: "~400 clinical notes",
notes: "Improved clinical discontinuous NER; attribute normalization",
splits: ["train", "test"],
tasks: ["ner", "discontinuous-ner"],
access_status: Registration,
categories: [ner, discontinuous_ner, clinical, biomedical],
},
I2B2_2010 {
name: "i2b2 2010",
description: "Clinical concept extraction and assertion classification. Foundational clinical NER benchmark.",
url: "",
entity_types: ["PROBLEM", "TREATMENT", "TEST"],
language: "en",
domain: "clinical",
license: "Research",
citation: "Uzuner et al. (2011)",
paper_url: "https://academic.oup.com/jamia/article/18/5/552/833880",
year: 2010,
format: "Custom",
size_hint: "~871 discharge summaries",
notes: "Foundational clinical NER; requires i2b2/n2c2 data use agreement",
splits: ["train", "test"],
tasks: ["ner"],
access_status: Registration,
categories: [ner, clinical, biomedical],
},
LexGLUENER {
name: "LexGLUE NER",
description: "Legal NER from LexGLUE benchmark. Legal entity extraction from case law and contracts.",
url: "https://github.com/coastalcph/lex-glue",
entity_types: ["PERSON", "ORGANIZATION", "LOCATION", "DATE", "LEGAL_REF", "COURT"],
language: "en",
domain: "legal",
license: "Research",
citation: "Chalkidis et al. (2022)",
paper_url: "https://aclanthology.org/2022.acl-long.297/",
year: 2022,
format: "JSONL",
size_hint: "Part of LexGLUE benchmark suite",
notes: "Legal domain benchmark; includes contracts, case law, legislation",
splits: ["train", "dev", "test"],
tasks: ["ner", "classification"],
categories: [ner, arcane_domain],
},
FinBenNER {
name: "FinBen NER",
description: "Financial NER from FinBen benchmark. Entity extraction from financial documents and filings.",
url: "https://github.com/TheFinAI/FinBen",
entity_types: ["COMPANY", "PERSON", "MONEY", "PERCENT", "DATE", "PRODUCT"],
language: "en",
domain: "financial",
license: "Research",
citation: "Xie et al. (2024)",
paper_url: "https://arxiv.org/abs/2402.12659",
year: 2024,
format: "JSONL",
size_hint: "Multi-task financial benchmark",
notes: "Financial IE benchmark; includes NER, classification, QA; 2024 NeurIPS",
splits: ["test"],
tasks: ["ner"],
categories: [ner, arcane_domain],
},
FiNER139 {
name: "FiNER-139",
description: "Financial NER with 139 fine-grained entity types. SEC 10-K/10-Q filings.",
url: "https://huggingface.co/datasets/nlpaueb/finer-139",
entity_types: ["COMPANY", "EXECUTIVE", "SUBSIDIARY", "PRODUCT", "REGULATION", "FINANCIAL_METRIC"],
language: "en",
domain: "financial",
license: "MIT",
citation: "Shah et al. (2023)",
year: 2023,
format: "JSONL",
size_hint: "~10k sentences, 139 entity types",
notes: "Fine-grained financial NER; hierarchical entity types; SEC filings",
splits: ["train", "test"],
tasks: ["ner"],
hf_id: "nlpaueb/finer-139",
categories: [ner, nested_ner, arcane_domain],
},
TaggedPBCEsperanto {
name: "taggedPBC Esperanto",
description: "POS-tagged Esperanto from Parallel Bible Corpus. ~1800 sentences with word-level alignment.",
url: "https://github.com/clab/taggedPBC",
entity_types: ["PER", "LOC", "ORG"],
language: "eo",
domain: "religious",
license: "CC-BY-4.0",
citation: "Zeman et al. (2025)",
paper_url: "https://arxiv.org/abs/2505.12560",
year: 2025,
format: "CoNLLU",
size_hint: "~1800 sentences, New Testament",
notes: "First large-scale annotated Esperanto corpus; cross-linguistic POS; no dedicated NER layer yet",
splits: ["train"],
tasks: ["ner", "sequence_labeling"],
categories: [ner, constructed, low_resource],
},
TaggedPBCKlingon {
name: "taggedPBC Klingon",
description: "POS-tagged Klingon from Parallel Bible Corpus. OVS word order with complex verbal morphology.",
url: "https://github.com/clab/taggedPBC",
entity_types: ["PER", "LOC", "ORG"],
language: "tlh",
domain: "religious",
license: "CC-BY-4.0",
citation: "Zeman et al. (2025)",
paper_url: "https://arxiv.org/abs/2505.12560",
year: 2025,
format: "CoNLLU",
size_hint: "~1800 sentences, New Testament",
notes: "Klingon has OVS word order, agglutinative verbs with suffix slots; tests non-SVO processing",
splits: ["train"],
tasks: ["ner", "sequence_labeling"],
categories: [ner, constructed, low_resource],
},
UDEsperantoCairo {
name: "UD Esperanto Cairo",
description: "Universal Dependencies treebank for Esperanto. Syntax annotation without NER layer.",
url: "https://raw.githubusercontent.com/UniversalDependencies/UD_Esperanto-Cairo/master/eo_cairo-ud-test.conllu",
entity_types: ["PER", "LOC", "ORG"],
language: "eo",
domain: "constructed_language",
license: "CC-BY-SA-4.0",
citation: "Wennerberg (2020)",
paper_url: "https://universaldependencies.org/eo/index.html",
year: 2020,
format: "CoNLLU",
size_hint: "2 documents (Manifesto, Cairo sample)",
notes: "Small treebank illustrating UD annotation for Esperanto; no NER layer but suitable base for annotation",
splits: ["test"],
tasks: ["ner"],
categories: [ner, constructed, low_resource],
},
KlingonEffectLID {
name: "Klingon Effect LID",
description: "Language ID dataset with 11 constructed languages. 14.2M sentences across 101 languages.",
url: "https://wmdqs.org/submissions-2025/19.pdf",
entity_types: [],
language: "mul",
domain: "general",
license: "Research",
citation: "Moura et al. (2025)",
paper_url: "https://wmdqs.org/submissions-2025/19.pdf",
year: 2025,
format: "Custom",
size_hint: "14.2M sentences, 101 languages (11 constructed)",
notes: "Shows constructed languages (Esperanto, Klingon, Ido, Interlingua) outperform natural languages in LID",
splits: ["test"],
tasks: ["classification"],
categories: [constructed, multilingual, adversarial],
},
LojbanTatoeba {
name: "Lojban Tatoeba",
description: "Lojban-English sentence pairs from Tatoeba. Logical language translation corpus.",
url: "https://tatoeba.org/en/downloads",
entity_types: [],
language: "jbo",
domain: "constructed_language",
license: "CC-BY-2.0",
citation: "Tatoeba Project (2024)",
year: 2024,
format: "TSV",
size_hint: "~3k sentence pairs",
notes: "Logical constructed language; predicate logic syntax; useful for semantic parsing studies",
splits: ["all"],
tasks: ["mt"],
categories: [constructed, low_resource],
},
InterlingueWikipedia {
name: "Interlingue Wikipedia",
description: "Interlingue (Occidental) Wikipedia text corpus. International auxiliary language.",
url: "https://dumps.wikimedia.org/iewiki/",
entity_types: [],
language: "ie",
domain: "encyclopedia",
license: "CC-BY-SA-4.0",
citation: "Wikimedia (2024)",
year: 2024,
format: "XML",
size_hint: "~4k articles",
notes: "Western European vocabulary roots; naturalistic IAL; smaller than Esperanto Wikipedia",
splits: ["all"],
tasks: ["lm"],
categories: [constructed, low_resource],
},
TokiPonaCorpus {
name: "Toki Pona Corpus",
description: "Toki Pona minimalist language corpus. 120-word language for semantic simplification.",
url: "https://github.com/kilipan/toki-pona-corpus",
entity_types: [],
language: "tok",
domain: "constructed_language",
license: "CC0-1.0",
citation: "Lang (2021)",
year: 2021,
format: "TXT",
size_hint: "~50k tokens",
notes: "Philosophical constructed language; only 120 words; tests compositional semantics",
splits: ["all"],
tasks: ["lm"],
categories: [constructed, low_resource],
},
OmniNER2025 {
name: "OmniNER2025",
description: "Diverse fine-grained Chinese NER covering informal text (social media, forums). Large-scale benchmark for modern NER models.",
url: "",
entity_types: ["PER", "LOC", "ORG", "GPE", "FAC", "PRODUCT", "EVENT"],
language: "zh",
domain: "social_media",
license: "Research",
citation: "OmniNER Team (2025)",
paper_url: "https://dl.acm.org/doi/10.1145/3726302.3730048",
year: 2025,
format: "JSONL",
size_hint: "Large-scale Chinese informal text",
notes: "2025 benchmark for fine-grained Chinese NER; expands beyond formal text; tests LLM capabilities",
splits: ["train", "dev", "test"],
tasks: ["ner"],
access_status: NotYetReleased,
categories: [ner, social_media, multilingual],
},
LegalCore {
name: "LegalCore",
description: "Event coreference in long legal documents. Long-distance cross-section event links.",
url: "",
entity_types: ["EVENT", "PARTICIPANT", "TIME"],
language: "en",
domain: "legal",
license: "Research",
citation: "ACL Findings (2025)",
paper_url: "https://aclanthology.org/2025.findings-acl.1284.pdf",
year: 2025,
format: "JSONL",
size_hint: "Long legal documents, largest tokens per document",
notes: "ACL 2025; benchmarks Llama-3.1, Mistral, Qwen, GPT-4; LLMs underperform supervised baselines",
splits: ["train", "dev", "test"],
tasks: ["event_coref", "coref"],
access_status: NotYetReleased,
categories: [coref, event_coref, long_document, arcane_domain],
},
Zcoref {
name: "Z-coref",
description: "Joint coreference and zero-pronoun resolution. For languages with pro-drop (Chinese, Japanese, Korean).",
url: "",
entity_types: ["ZERO_PRONOUN", "ENTITY"],
language: "mul",
domain: "general",
license: "Research",
citation: "Z-coref Authors (2024)",
paper_url: "https://arxiv.org/pdf/2504.05824",
year: 2024,
format: "CoNLL",
size_hint: "Multi-language pro-drop coreference",
notes: "Tests handling of dropped arguments; critical for CJK languages; zero anaphora resolution",
splits: ["train", "dev", "test"],
tasks: ["coref"],
access_status: NotYetReleased,
categories: [coref, multilingual, abstract_anaphora],
},
TikTalkCoref {
name: "TikTalkCoref",
description: "Chinese social media dialogue coreference. Person mentions in Douyin video comments with singleton handling.",
url: "",
entity_types: ["PER"],
language: "zh",
domain: "social_media",
license: "Research",
citation: "Li, Gong & Fu (2025)",
paper_url: "https://arxiv.org/abs/2504.14321",
year: 2025,
format: "Custom",
size_hint: "1,012 dialogues, 2,179 mentions, 1,435 clusters",
notes: "First Chinese MCR dataset for social media. Maverick outperforms e2e-coref (65.5 vs 39.1 Avg.F1). High singleton rate: 44% pronouns, 34% proper names, 22% common nouns. Text-only portion; multimodal aspect out of scope.",
splits: ["train", "dev", "test"],
tasks: ["coref"],
access_status: NotYetReleased,
categories: [coref, dialogue, social_media],
},
MHERCL {
name: "MHERCL",
description: "Historical long-tail entity linking benchmark. Tests LLM behavior on rare/historical Wikidata entities.",
url: "https://arxiv.org/html/2505.03473v1",
entity_types: ["HISTORICAL_ENTITY"],
language: "en",
domain: "historical",
license: "Research",
citation: "MHERCL Authors (2025)",
paper_url: "https://arxiv.org/html/2505.03473v1",
year: 2025,
format: "JSONL",
size_hint: "Long-tail historical entities",
notes: "v0.1; tests EL on niche historical entities; analyzes LLM behavior on rare entities",
splits: ["test"],
tasks: ["el", "entity_linking"],
categories: [entity_linking, historical, adversarial],
},
SNOMEDChallenge {
name: "SNOMED CT EL Challenge",
description: "Clinical entity linking to SNOMED CT. From SNOMED International 2024 challenge.",
url: "",
entity_types: ["CLINICAL_CONCEPT"],
language: "en",
domain: "clinical",
license: "Research",
citation: "SNOMED International (2024)",
paper_url: "https://www.snomed.org/news/snomed-international-announces-entity-linking-challenge-winners",
year: 2024,
format: "Custom",
size_hint: "Clinical notes, SNOMED CT linked",
notes: "2024 challenge dataset; SNOMED CT coded clinical text; benchmarks clinical EL systems",
splits: ["train", "test"],
tasks: ["el", "entity_linking"],
access_status: Registration,
categories: [entity_linking, clinical, biomedical],
},
ESCOSkillsEL {
name: "ESCO Skills EL",
description: "Entity linking for occupational skills to ESCO taxonomy. Job market domain, multilingual.",
url: "",
entity_types: ["SKILL"],
language: "mul",
domain: "general",
license: "Research",
citation: "EACL Findings (2024)",
paper_url: "https://aclanthology.org/2024.findings-eacl.28/",
year: 2024,
format: "Custom",
size_hint: "Skill mentions across multiple languages",
notes: "Complements MELO; links skills (not occupations) to ESCO taxonomy; job posting text",
splits: ["train", "test"],
tasks: ["el", "entity_linking"],
access_status: ContactAuthors,
categories: [entity_linking, multilingual],
},
NatureLMAudio {
name: "NatureLM-audio",
description: "Foundation model training collection for bioacoustics. Multi-species audio-text pairs.",
url: "https://github.com/earthspecies/naturelm-audio",
entity_types: ["SPECIES", "CALL_TYPE", "BEHAVIOR"],
language: "en",
domain: "bioacoustics",
license: "Research",
citation: "NatureLM Team (2024)",
paper_url: "https://arxiv.org/abs/2411.07186",
year: 2024,
format: "Custom",
size_hint: "Multi-taxon audio-text pairs (birds, marine mammals, primates)",
notes: "Bioacoustic foundation model data; paired audio-text descriptions; cross-taxa experiments",
splits: ["train", "test"],
tasks: ["classification", "captioning"],
categories: [arcane_domain, multilingual],
},
BEANSZero {
name: "BEANS-Zero",
description: "Bioacoustics benchmark beyond species classification. Natural-language prompts for animal sounds.",
url: "https://github.com/earthspecies/beans-zero",
entity_types: ["SPECIES", "CALL_TYPE", "INDIVIDUAL"],
language: "en",
domain: "bioacoustics",
license: "Research",
citation: "NatureLM Team (2024)",
paper_url: "https://arxiv.org/abs/2411.07186",
year: 2024,
format: "Custom",
notes: "Zero-shot transfer to unseen taxa; captioning, retrieval, instruction-following on animal vocalizations",
splits: ["test"],
tasks: ["classification", "retrieval"],
categories: [arcane_domain, adversarial],
},
NLMChem {
name: "NLM-Chem",
description: "Chemical entity recognition and normalization. Full-text PMC articles with MeSH identifiers.",
url: "https://ftp.ncbi.nlm.nih.gov/pub/lu/NLM-Chem/",
entity_types: ["CHEMICAL", "DRUG"],
language: "en",
domain: "biomedical",
license: "Public",
citation: "Islamaj et al. (2021)",
paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/baac102/6858529",
year: 2021,
format: "BRAT",
size_hint: "~150 full-text articles, ~38k annotations",
notes: "Gold-standard chemical NER; normalized to MeSH; used for BioCreative VII",
splits: ["train", "test"],
tasks: ["ner", "el"],
categories: [ner, biomedical, entity_linking],
},
CHEMDNER {
name: "CHEMDNER",
description: "Chemical compound and drug name recognition in scientific text.",
url: "https://biocreative.bioinformatics.udel.edu/tasks/biocreative-iv/chemdner/",
entity_types: ["CHEMICAL", "DRUG", "ABBREVIATION"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Krallinger et al. (2015)",
paper_url: "https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2",
year: 2015,
format: "BIO",
size_hint: "~10k abstracts",
notes: "BioCreative IV shared task; abstract-level chemical NER; foundational chemistry benchmark",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, biomedical],
},
TimeBankDense {
name: "TimeBank-Dense",
description: "Dense temporal relation annotation. Re-annotation of TimeBank with more consistent TLINK labels.",
url: "https://github.com/bethard/timebank-dense",
entity_types: ["EVENT", "TIMEX3"],
language: "en",
domain: "news",
license: "Research",
citation: "Chambers et al. (2014)",
paper_url: "https://aclanthology.org/Q14-1002/",
year: 2014,
format: "TimeML",
size_hint: "~36 documents, dense annotation",
notes: "Event-event temporal relations; BEFORE/AFTER/INCLUDES/VAGUE; timeline construction benchmark",
splits: ["train", "dev", "test"],
tasks: ["temporal", "event_coref"],
categories: [ner, event_coref],
},
TwitterGMNER {
name: "Twitter-GMNER",
description: "Grounded Multimodal NER. Entities linked to bounding boxes in social media images.",
url: "https://github.com/JinYuanLi0012/RiVEG",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Li et al. (2024)",
paper_url: "https://aclanthology.org/2024.findings-acl.58/",
year: 2024,
format: "JSONL",
size_hint: "~8k tweets with images",
notes: "Entity mentions grounded to image regions; visual-textual entity alignment",
splits: ["train", "dev", "test"],
tasks: ["ner", "grounding"],
categories: [ner, social_media, arcane_domain],
},
MNERMI {
name: "MNER-MI",
description: "Multimodal NER with Multiple Images. Social media posts with multiple image context.",
url: "https://github.com/NUSTM/MNER-MI",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Wang et al. (2024)",
paper_url: "https://aclanthology.org/2024.lrec-main.1001/",
year: 2024,
format: "JSONL",
size_hint: "~5k tweets with multiple images",
notes: "Multi-image context improves NER; temporal-prompt model baseline; LREC-COLING 2024",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, social_media],
},
TwoMNER {
name: "2M-NER",
description: "Multilingual Multimodal NER. Four languages with text-image pairs.",
url: "https://github.com/Alibaba-NLP/2M-NER",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "mul",
domain: "social_media",
license: "Apache-2.0",
citation: "Liu et al. (2024)",
paper_url: "https://arxiv.org/abs/2404.17122",
year: 2024,
format: "JSONL",
size_hint: "~20k examples, 4 languages (EN, FR, DE, ES)",
notes: "Contrastive text-image alignment; multilingual multimodal NER benchmark",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, multilingual, social_media],
},
MathEntities {
name: "Mathematical Entities",
description: "Terminology and definition extraction from mathematical text. Category theory corpora.",
url: "https://github.com/dmazzei/mathematical-entities",
entity_types: ["TERM", "DEFINITION", "THEOREM"],
language: "en",
domain: "scientific",
license: "CC-BY-4.0",
citation: "Mazzei et al. (2024)",
paper_url: "https://aclanthology.org/2024.lrec-main.966/",
year: 2024,
format: "LaTeX",
size_hint: "~3 corpora in category theory",
notes: "LaTeX source preservation; math-aware NER; entity linking to Wikidata/nLab",
splits: ["train", "test"],
tasks: ["ner", "el"],
categories: [ner, arcane_domain, entity_linking],
},
SciERC {
name: "SciERC",
description: "Scientific information extraction from AI/ML papers. Nested entities and relations.",
url: "https://nlp.cs.washington.edu/sciIE/",
entity_types: ["TASK", "METHOD", "METRIC", "MATERIAL", "GENERIC", "OTHER"],
language: "en",
domain: "scientific",
license: "CC-BY-4.0",
citation: "Luan et al. (2018)",
paper_url: "https://aclanthology.org/D18-1360/",
year: 2018,
format: "JSONL",
size_hint: "~500 abstracts",
notes: "Canonical scientific NER + relation extraction; nested entities common",
splits: ["train", "dev", "test"],
tasks: ["ner", "re"],
categories: [ner, nested_ner, relation_extraction, arcane_domain],
},
GeoWebNews {
name: "GeoWebNews",
description: "Geoparsing benchmark from web news. Toponyms with geocoding coordinates.",
url: "https://github.com/milangritta/GeoWebNews",
entity_types: ["LOC", "GPE", "FACILITY"],
language: "en",
domain: "news",
license: "CC-BY-4.0",
citation: "Gritta et al. (2020)",
paper_url: "https://aclanthology.org/2020.lrec-1.381/",
year: 2020,
format: "CoNLL",
size_hint: "~4k documents",
notes: "Toponym recognition + resolution; GeoNames linking; web news geoparsing",
splits: ["train", "dev", "test"],
tasks: ["ner", "el"],
categories: [ner, entity_linking],
},
LGL {
name: "LGL",
description: "Local-Global Lexicon for toponym disambiguation. News articles with geolocation.",
url: "https://github.com/wikipedia2vec/wikipedia2vec",
entity_types: ["LOC"],
language: "en",
domain: "news",
license: "MIT",
citation: "Lieberman et al. (2010)",
year: 2010,
format: "Custom",
size_hint: "~5.8k place references",
notes: "Toponym disambiguation benchmark; local vs global context for geolocation",
splits: ["all"],
tasks: ["ner", "el"],
categories: [ner, entity_linking],
},
TASTEset {
name: "TASTEset",
description: "Recipe ingredient NER. 700 annotated recipe ingredient lists with 9 entity classes.",
url: "https://github.com/taisti/TASTEset",
entity_types: ["INGREDIENT", "QUANTITY", "UNIT", "STATE", "SIZE", "TEMP"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "TASTEset Team (2023)",
year: 2023,
format: "BIO",
size_hint: "~700 ingredient lists",
notes: "Recipe NER benchmark; BIO/BILOU conversion utilities; BERT model pipeline",
splits: ["train", "test"],
tasks: ["ner"],
categories: [ner, arcane_domain],
},
RecipeNER {
name: "Recipe NER",
description: "Deep learning recipe NER. Multi-scale datasets with ingredient and instruction entities.",
url: "https://github.com/cosylabiiit/recipe-ner",
entity_types: ["INGREDIENT", "QUANTITY", "UNIT", "PROCESS", "UTENSIL", "TEMP"],
language: "en",
domain: "food",
license: "MIT",
citation: "Deepgram (2024)",
paper_url: "https://aclanthology.org/2024.lrec-main.406/",
year: 2024,
format: "BIO",
size_hint: "~88k phrases (6.6k manual, 26k augmented, 88k machine)",
notes: "Three-tier dataset; baseline pipelines exist (e.g., spaCy transformer); recipe IE pipeline",
splits: ["train", "test"],
tasks: ["ner"],
categories: [ner, arcane_domain],
},
CodeSearchNet {
name: "CodeSearchNet",
description: "Code understanding benchmark. Function documentation and code search across 6 languages.",
url: "https://github.com/github/CodeSearchNet",
entity_types: ["FUNCTION", "CLASS", "VARIABLE", "MODULE"],
language: "mul",
domain: "code",
license: "MIT",
citation: "Husain et al. (2019)",
paper_url: "https://arxiv.org/abs/1909.09436",
year: 2019,
format: "JSONL",
size_hint: "~2M functions across 6 programming languages",
notes: "Code-docstring pairs; Python, Java, Go, PHP, JavaScript, Ruby; foundation for code NER",
splits: ["train", "dev", "test"],
tasks: ["retrieval"],
categories: [arcane_domain, multilingual],
},
FABLE {
name: "FABLE",
description: "Fiction Adapted BERT for Literary Entities. DeBERTa-based NER for narrative fiction.",
url: "https://huggingface.co/DeBERTa-literary-entities",
entity_types: ["CHARACTER", "LOCATION", "ORGANIZATION", "ARTIFACT"],
language: "en",
domain: "fiction",
license: "MIT",
citation: "FABLE Team (2024)",
year: 2024,
format: "Custom",
notes: "Literary NER model; targets invented names in fantasy/SF; trained on narrative fiction",
categories: [ner, literary],
},
ELGold {
name: "ELGold",
description: "Gold-standard multi-genre Polish NER+EL. Includes fiction, press, blogs.",
url: "https://mostwiedzy.pl/en/open-research-data/elgold-gold-standard-multi-genre-dataset",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "pl",
domain: "general",
license: "CC-BY-4.0",
citation: "Pokrywka et al. (2025)",
paper_url: "https://www.nature.com/articles/s41597-025-05274-4",
year: 2025,
format: "JSONL",
notes: "Multi-genre including fiction; Wikipedia-linked; Polish language",
splits: ["train", "test"],
tasks: ["ner", "el"],
categories: [ner, entity_linking, literary, multilingual],
},
StreamingCDCoref {
name: "Streaming CD-Coref",
description: "Streaming cross-document entity coreference protocol. News domain streaming evaluation.",
url: "https://www.cs.jhu.edu/~mdredze/publications/streaming_coref_coling.pdf",
entity_types: ["PER", "ORG", "LOC"],
language: "en",
domain: "news",
license: "Research",
citation: "Dredze et al. (2010)",
paper_url: "https://aclanthology.org/C10-1032/",
year: 2010,
format: "Custom",
notes: "Canonical streaming entity clustering; O(n) single-pass; evolving cluster representations",
categories: [coref, long_document],
},
TemDocRED {
name: "Tem-DocRED",
description: "Temporal document-level relation extraction. Converts static triples to temporal quadruples.",
url: "https://github.com/THUDM/Tem-DocRED",
entity_types: ["PER", "ORG", "LOC", "TIME"],
language: "en",
domain: "wikipedia",
license: "MIT",
citation: "Zhang et al. (2024)",
paper_url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC12048500/",
year: 2024,
format: "JSONL",
size_hint: "Re-DocRED + temporal timestamps",
notes: "Temporal KG construction from documents; LLM + pattern mining for timestamp inference",
splits: ["train", "dev", "test"],
tasks: ["re", "temporal"],
categories: [relation_extraction, long_document],
},
SciCoRadar {
name: "SciCo-Radar",
description: "Scientific cross-document concept coreference. Dynamic definitions via LLM retrieval.",
url: "https://github.com/allenai/scico-radar",
entity_types: ["CONCEPT", "METHOD", "TASK", "MATERIAL"],
language: "en",
domain: "scientific",
license: "Apache-2.0",
citation: "Wadden et al. (2024)",
paper_url: "https://arxiv.org/abs/2409.15113",
year: 2024,
format: "JSONL",
notes: "Cross-doc concept coref with hierarchy; LLM-generated relational definitions improve F1",
splits: ["train", "dev", "test"],
tasks: ["coref"],
categories: [coref, arcane_domain],
},
EventKGDrift {
name: "Event KG Drift",
description: "Multi-perspective concept drift detection on event knowledge graphs.",
url: "https://research.tue.nl/files/349781334/978-3-031-61057-8_9.pdf",
entity_types: ["EVENT", "CASE", "ACTOR", "TIME"],
language: "en",
domain: "evaluation",
license: "Research",
citation: "TU Eindhoven (2024)",
year: 2024,
format: "Custom",
notes: "Actor-centric features give 2.6x stronger drift signals; temporal graph drift on EKGs",
categories: [event_coref, long_document, arcane_domain],
},
WikidataDrift {
name: "Wikidata Semantic Drift",
description: "Semantic drift detection in Wikidata. LLM-based classification inconsistency detection.",
url: "https://arxiv.org/abs/2511.04926",
entity_types: [],
language: "mul",
domain: "encyclopedia",
license: "CC0-1.0",
citation: "Wikidata Drift Team (2025)",
paper_url: "https://arxiv.org/abs/2511.04926",
year: 2025,
format: "Custom",
notes: "Multi-dimensional semantic risk model; drift threshold ~0.6; continuous KG curation",
categories: [entity_linking, adversarial],
},
AIDA {
name: "AIDA-CoNLL (v2)",
description: "Entity linking to Wikipedia. CoNLL-YAGO dataset for named entity disambiguation.",
url: "https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "news",
license: "Research",
citation: "Hoffart et al. (2011)",
paper_url: "https://aclanthology.org/D11-1072/",
year: 2011,
format: "CoNLL",
notes: "Entity linking benchmark; links CoNLL-2003 mentions to YAGO/Wikipedia",
categories: [entity_linking],
},
AIONER {
name: "AIONER",
description: "All-in-one biomedical NER. Unified biomedical entity extraction model.",
url: "https://github.com/ncbi/AIONER",
entity_types: ["Gene", "Disease", "Chemical", "Species"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Luo et al. (2023)",
year: 2023,
format: "JSONL",
notes: "Unified model for multiple biomedical entity types",
categories: [ner, biomedical],
},
AISHELLNER {
name: "AISHELL-NER",
description: "Chinese speech NER from AISHELL corpus. Named entities in Mandarin speech.",
url: "https://www.aishelltech.com/aishell_2",
entity_types: ["PER", "LOC", "ORG"],
language: "zh",
domain: "speech",
license: "Research",
citation: "AISHELL Foundation (2017)",
year: 2017,
format: "Custom",
notes: "Speech transcription NER; tests robustness to ASR errors",
categories: [ner, speech],
},
AstroNER {
name: "AstroNER",
description: "Astronomy named entity recognition. Celestial objects and astronomical concepts.",
url: "https://github.com/astronomical-ner/AstroNER",
entity_types: ["CelestialObject", "Instrument", "Mission", "Phenomenon"],
language: "en",
domain: "astrophysics",
license: "CC-BY-4.0",
citation: "NASA ADS Team",
year: 2022,
format: "CoNLL",
notes: "Domain-specific NER for astronomy literature",
categories: [ner, arcane_domain],
},
B2NERD {
name: "B2NERD",
description: "Billion-scale news NER dataset. Large-scale distantly supervised NER.",
url: "https://huggingface.co/datasets/Umean/B2NERD",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "news",
license: "CC-BY-4.0",
citation: "Umean (2023)",
year: 2023,
format: "JSONL",
notes: "Large-scale silver-standard NER; useful for pre-training",
categories: [ner],
},
BioMNER {
name: "BioMNER",
description: "Biomedical method NER. Scientific methods and techniques in biomedical text.",
url: "https://huggingface.co/datasets/tner/bionlp2004",
entity_types: ["Method", "Technique", "Protocol"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "BioNLP (2004)",
year: 2004,
format: "BIO",
notes: "Biomedical methodology extraction; from BioNLP shared task",
categories: [ner, biomedical],
},
LegNER {
name: "LegNER",
description: "Legal domain NER. Named entities in legal documents and court opinions.",
url: "https://github.com/Liquid-Legal-Institute/LegalBench",
entity_types: ["Court", "Judge", "Statute", "Party", "Date"],
language: "en",
domain: "legal",
license: "CC-BY-4.0",
citation: "Legal NLP Team (2021)",
year: 2021,
format: "CoNLL",
notes: "Legal domain specialization; court documents and statutes",
categories: [ner],
},
OpenNER {
name: "OpenNER 1.0",
description: "Open domain NER benchmark. Broad coverage across multiple domains.",
url: "https://huggingface.co/datasets/yongsun-yoon/open-ner-english",
entity_types: ["PER", "LOC", "ORG", "EVENT", "PRODUCT"],
language: "en",
domain: "mixed",
license: "CC-BY-SA-4.0",
citation: "Babelscape (2023)",
year: 2023,
format: "JSONL",
notes: "Community mirror; open-domain NER benchmark",
tasks: ["ner"],
hf_id: "yongsun-yoon/open-ner-english",
access_status: Public,
categories: [ner],
},
SciNER {
name: "SciNER",
description: "Scientific literature NER. Entities from scientific papers across disciplines.",
url: "https://github.com/allenai/sciner",
entity_types: ["Method", "Task", "Dataset", "Metric", "Material"],
language: "en",
domain: "scientific",
license: "Apache-2.0",
citation: "Allen AI (2022)",
year: 2022,
format: "JSONL",
notes: "Scientific entities; paper abstracts and methods sections",
categories: [ner],
},
FinanceNER {
name: "FinanceNER",
description: "Financial domain NER. Named entities from financial documents and news.",
url: "https://github.com/nlpaueb/finer",
entity_types: ["Company", "Stock", "Currency", "Amount", "Date"],
language: "en",
domain: "financial",
license: "Research",
citation: "FinNLP (2020)",
year: 2020,
format: "CoNLL",
notes: "Financial entity extraction; SEC filings and news",
categories: [ner],
},
TechNER {
name: "TechNER",
description: "Technology domain NER. Software, hardware, and technical entities.",
url: "https://github.com/techner/techner",
entity_types: ["Software", "Hardware", "Company", "Version", "Language"],
language: "en",
domain: "code",
license: "MIT",
citation: "TechNER Team (2021)",
year: 2021,
format: "CoNLL",
notes: "Technology entities; Stack Overflow and documentation",
categories: [ner],
},
FictionNER750M {
name: "FictionNER-750M",
description: "Fiction NER at scale. Named entities from 750M tokens of fiction text.",
url: "https://huggingface.co/datasets/SaladTechnologies/fiction-ner-750m",
entity_types: ["Character", "Location", "Object", "Organization"],
language: "en",
domain: "fiction",
license: "CC-BY-4.0",
citation: "Fiction NER Team (2023)",
year: 2023,
format: "JSONL",
notes: "Large-scale fiction NER; public on HuggingFace",
tasks: ["ner"],
hf_id: "SaladTechnologies/fiction-ner-750m",
access_status: Public,
categories: [ner, literary],
},
CharacterCodex {
name: "Character Codex",
description: "Character entity recognition in fiction. Literary character identification.",
url: "https://github.com/character-codex/character-codex",
entity_types: ["Character", "Alias", "Role"],
language: "en",
domain: "fiction",
license: "CC-BY-4.0",
citation: "Character Codex Team (2022)",
year: 2022,
format: "JSONL",
notes: "Character tracking across narrative; aliases and roles",
categories: [ner, literary],
},
MUC6 {
name: "MUC-6",
description: "Message Understanding Conference 6. Seminal NER and coreference dataset.",
url: "https://catalog.ldc.upenn.edu/LDC2003T13",
entity_types: ["ENAMEX", "TIMEX", "NUMEX"],
language: "en",
domain: "news",
license: "LDC",
citation: "Grishman & Sundheim (1996)",
paper_url: "https://aclanthology.org/C96-1079/",
year: 1996,
format: "SGML",
notes: "Historically significant; established NER evaluation paradigm",
categories: [ner, historical],
},
MUC7 {
name: "MUC-7",
description: "Message Understanding Conference 7. Expanded NE types from MUC-6.",
url: "https://catalog.ldc.upenn.edu/LDC2001T02",
entity_types: ["ENAMEX", "TIMEX", "NUMEX"],
language: "en",
domain: "news",
license: "LDC",
citation: "Chinchor (1998)",
paper_url: "https://aclanthology.org/M98-1002/",
year: 1998,
format: "SGML",
notes: "Refined MUC-6 guidelines; includes satellite launch texts",
categories: [ner, historical],
},
OntoNotes50 {
name: "OntoNotes 5.0",
description: "OntoNotes Release 5.0. Multi-genre corpus with NER, coref, and more.",
url: "https://catalog.ldc.upenn.edu/LDC2013T19",
entity_types: ["PER", "ORG", "GPE", "LOC", "FAC", "NORP", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"],
language: "en",
domain: "mixed",
license: "LDC",
citation: "Weischedel et al. (2013)",
paper_url: "https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf",
year: 2013,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
size_hint: "~2.9M words across genres",
notes: "Gold standard for multiple NLP tasks; WSJ, broadcast, web, telephone",
categories: [ner, coref],
},
GUM {
name: "GUM",
description: "Georgetown University Multilayer corpus. Rich annotation across 12 genres.",
url: "https://github.com/amir-zeldes/gum",
entity_types: ["person", "place", "organization", "time", "event"],
language: "en",
domain: "mixed",
license: "CC-BY-4.0",
citation: "Zeldes (2017)",
paper_url: "https://aclanthology.org/W17-0809/",
year: 2017,
format: "CoNLL",
size_hint: "~200k tokens, 12 genres",
notes: "Multi-layer annotation; coreference, RST, entities",
categories: [ner, coref],
},
TACKBP {
name: "TAC-KBP",
description: "TAC Knowledge Base Population. Entity linking and slot filling benchmark.",
url: "https://tac.nist.gov/",
entity_types: ["PER", "ORG", "GPE"],
language: "en",
domain: "news",
license: "LDC",
citation: "Ji et al. (2010)",
paper_url: "https://aclanthology.org/C10-1058/",
year: 2010,
format: "Custom",
notes: "Entity linking to Wikipedia/KB; slot filling for attributes",
categories: [entity_linking],
},
HAREM {
name: "HAREM",
description: "Portuguese NER evaluation. First and Second HAREM conferences.",
url: "https://www.linguateca.pt/HAREM/",
entity_types: ["PESSOA", "LOCAL", "ORGANIZACAO", "TEMPO", "VALOR"],
language: "pt",
domain: "news",
license: "Research",
citation: "Santos et al. (2006)",
paper_url: "https://www.linguateca.pt/HAREM/",
year: 2006,
format: "SGML",
notes: "Portuguese NER benchmark; morphologically rich language",
categories: [ner, multilingual],
},
GunViolenceCorpus {
name: "Gun Violence Corpus (v2)",
description: "Gun violence event extraction. Named entities and events from news.",
url: "https://github.com/gun-violence-corpus/gvc",
entity_types: ["Shooter", "Victim", "Weapon", "Location", "Date"],
language: "en",
domain: "news",
license: "CC-BY-4.0",
citation: "Pavlick et al. (2016)",
year: 2016,
format: "Custom",
notes: "Event extraction; sensitive domain requiring careful handling",
categories: [ner, event_coref],
},
MAVEN {
name: "MAVEN",
description: "Massive general-domain event detection. 168 event types from Wikipedia, 4x larger than ACE.",
url: "https://github.com/THU-KEG/MAVEN-dataset",
entity_types: ["EVENT_TRIGGER"],
language: "en",
domain: "wikipedia",
license: "MIT",
citation: "Wang et al. (2020)",
paper_url: "https://aclanthology.org/2020.emnlp-main.129/",
year: 2020,
format: "JSONL",
annotation_scheme: "Trigger-based",
size_hint: "~118k trigger instances, 4,480 documents, 168 event types",
notes: "EMNLP 2020; largest general-domain event detection (ED) dataset; CodaLab leaderboard available; Tsinghua Cloud/Google Drive download",
splits: ["train", "valid", "test"],
tasks: ["event_extraction"],
access_status: Public,
categories: [ner],
},
MAVENArg {
name: "MAVEN-ARG",
description: "MAVEN extended with event arguments and relations. Complete event extraction benchmark.",
url: "https://github.com/THU-KEG/MAVEN-Argument",
entity_types: ["EVENT_TRIGGER", "EVENT_ARGUMENT", "EVENT_RELATION"],
language: "en",
domain: "wikipedia",
license: "MIT",
citation: "Wang et al. (2024)",
paper_url: "https://aclanthology.org/2024.acl-long.224/",
year: 2024,
format: "JSONL",
annotation_scheme: "Trigger-Argument",
size_hint: "~98k argument annotations, ~21k relations",
notes: "ACL 2024; builds on MAVEN; supports ED + EAE + ERE tasks; all-in-one event understanding",
splits: ["train", "valid", "test"],
tasks: ["event_extraction", "relation_extraction"],
access_status: Public,
categories: [ner, relation_extraction],
},
CASIE {
name: "CASIE",
description: "Cybersecurity event extraction. Attack patterns, vulnerabilities, malware events.",
url: "https://github.com/Ebiquity/CASIE",
entity_types: ["Attack-Pattern", "Vulnerability", "Data-Breach", "Malware", "Patch"],
language: "en",
domain: "cybersecurity",
license: "CC-BY-4.0",
citation: "Satyapanich et al. (2020)",
paper_url: "https://aclanthology.org/2020.case-1.12/",
year: 2020,
format: "Standoff",
size_hint: "~1k documents, 5 event types, 26 argument roles",
notes: "Domain-specific event extraction; cybersecurity news articles",
splits: ["train", "dev", "test"],
tasks: ["event_extraction", "ner"],
access_status: Public,
categories: [ner, arcane_domain],
},
RAMS {
name: "RAMS",
description: "Roles Across Multiple Sentences. Cross-sentence event argument extraction with 139 event types.",
url: "https://nlp.jhu.edu/rams/",
entity_types: ["EVENT_TRIGGER", "EVENT_ARGUMENT"],
language: "en",
domain: "news",
license: "Research",
citation: "Ebner et al. (2020)",
paper_url: "https://aclanthology.org/2020.acl-main.718/",
year: 2020,
format: "JSONL",
size_hint: "~9,124 event instances, 139 event types",
notes: "ACL 2020; tests implicit/cross-sentence arguments; requires multi-sentence reasoning",
splits: ["train", "dev", "test"],
tasks: ["event_extraction"],
access_status: Public,
categories: [ner, long_document],
},
SLUE {
name: "SLUE",
description: "Spoken Language Understanding Evaluation. NER in speech transcripts.",
url: "https://github.com/asappresearch/slue-toolkit",
entity_types: ["PER", "LOC", "ORG", "DATE"],
language: "en",
domain: "speech",
license: "MIT",
citation: "Shon et al. (2022)",
paper_url: "https://aclanthology.org/2022.naacl-main.137/",
year: 2022,
format: "JSONL",
notes: "End-to-end speech NER; VoxPopuli and VoxCeleb sources",
categories: [ner, speech],
},
CRAFTCoref {
name: "CRAFT Coreference",
description: "Colorado Richly Annotated Full-Text corpus coreference. Biomedical coref.",
url: "https://github.com/UCDenver-ccp/CRAFT",
entity_types: ["Gene", "Protein", "Cell", "Organism"],
language: "en",
domain: "biomedical",
license: "CC-BY-4.0",
citation: "Cohen et al. (2017)",
paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/bax087/4621360",
year: 2017,
format: "Standoff",
notes: "Full-text biomedical articles; coreference including bridging",
categories: [coref, biomedical],
},
FootballCorefCorpus {
name: "Football Coreference Corpus (v2)",
description: "Cross-document event coreference for football matches.",
url: "https://github.com/cltl/FCC",
entity_types: ["Event", "Team", "Player", "Location"],
language: "en",
domain: "sports",
license: "CC-BY-4.0",
citation: "Vossen et al. (2018)",
year: 2018,
format: "Custom",
notes: "Cross-document event coreference; sports domain",
categories: [event_coref],
},
MultipartyDialogueCoref {
name: "Multiparty Dialogue Coreference",
description: "Coreference in multi-party conversations. Meeting and chat transcripts.",
url: "https://github.com/sopan-sarkar/multiparty-dialogue-coref",
entity_types: ["PER", "ORG", "LOC"],
language: "en",
domain: "dialogue",
license: "CC-BY-4.0",
citation: "Sarkar et al. (2020)",
year: 2020,
format: "JSONL",
notes: "Multi-party setting; speaker identification challenges",
categories: [coref, dialogue],
},
CODICRAC {
name: "CODI-CRAC",
description: "CODI/CRAC shared task on anaphora and coreference. Multiple languages.",
url: "https://github.com/UniversalAnaphora/UA-CODI-CRAC",
entity_types: ["PER", "ORG", "LOC", "Event"],
language: "mul",
domain: "mixed",
license: "CC-BY-4.0",
citation: "CODI-CRAC Team (2022)",
paper_url: "https://aclanthology.org/2022.codi-1.0/",
year: 2022,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
notes: "Shared task data; includes bridging and discourse deixis",
categories: [coref, multilingual],
},
MixRED {
name: "MixRED",
description: "Mixed relation extraction dataset. Multiple relation types and domains.",
url: "https://github.com/mixred/MixRED",
entity_types: ["PER", "ORG", "LOC"],
language: "en",
domain: "mixed",
license: "CC-BY-4.0",
citation: "MixRED Team (2022)",
year: 2022,
format: "JSONL",
notes: "Relation extraction across multiple domains",
categories: [relation_extraction],
},
CovEReD {
name: "CovEReD",
description: "COVID-19 relation extraction dataset. Biomedical relations from pandemic literature.",
url: "https://github.com/covered/CovEReD",
entity_types: ["Drug", "Disease", "Gene", "Symptom"],
language: "en",
domain: "biomedical",
license: "CC-BY-4.0",
citation: "CovEReD Team (2021)",
year: 2021,
format: "JSONL",
notes: "COVID-19 specific; drug-disease-gene relations",
categories: [relation_extraction, biomedical],
},
SciER {
name: "SciER",
description: "Scientific entity and relation extraction. From AI/ML papers.",
url: "https://github.com/allenai/sciie",
entity_types: ["Task", "Method", "Metric", "Material", "Generic"],
language: "en",
domain: "scientific",
license: "Apache-2.0",
citation: "Luan et al. (2018)",
paper_url: "https://aclanthology.org/D18-1360/",
year: 2018,
format: "JSONL",
notes: "Scientific IE; paper abstracts with nested entities",
categories: [ner, relation_extraction, nested_ner],
},
WEBNLG {
name: "WebNLG",
description: "Web NLG Challenge dataset. RDF-to-text generation with entity-relation triples.",
url: "https://gitlab.com/webnlg/challenge-2017",
entity_types: ["Entity"],
language: "en",
domain: "wikipedia",
license: "CC-BY-4.0",
citation: "Gardent et al. (2017)",
paper_url: "https://aclanthology.org/W17-3518/",
year: 2017,
format: "XML",
notes: "RDF triples to natural language; 15 DBpedia categories",
categories: [relation_extraction],
},
AkkadianUD {
name: "Akkadian UD",
description: "Universal Dependencies for Akkadian. Cuneiform texts from ancient Mesopotamia.",
url: "https://universaldependencies.org/treebanks/akk_pisandub/index.html",
entity_types: ["PER", "LOC", "GPE"],
language: "akk",
domain: "historical",
license: "CC-BY-SA-4.0",
citation: "UD Akkadian Team",
year: 2020,
format: "CoNLLU",
notes: "Cuneiform script; extinct Semitic language",
categories: [historical, ancient],
},
AncientHebrewUD {
name: "Ancient Hebrew UD",
description: "Universal Dependencies for Biblical Hebrew. Hebrew Bible text.",
url: "https://universaldependencies.org/treebanks/hbo_ptnk/index.html",
entity_types: ["PER", "LOC", "GPE"],
language: "hbo",
domain: "religious",
license: "CC-BY-SA-4.0",
citation: "UD Hebrew Team",
year: 2019,
format: "CoNLLU",
notes: "Biblical Hebrew; Torah and Prophets",
categories: [historical, ancient],
},
ClassicalChineseUD {
name: "Classical Chinese UD",
description: "Universal Dependencies for Classical/Literary Chinese. Pre-modern texts.",
url: "https://universaldependencies.org/treebanks/lzh_kyoto/index.html",
entity_types: ["PER", "LOC", "GPE"],
language: "lzh",
domain: "historical",
license: "CC-BY-SA-4.0",
citation: "UD Classical Chinese Team",
year: 2018,
format: "CoNLLU",
notes: "Literary Chinese; classical texts and commentaries",
categories: [historical, ancient],
},
CopticUD {
name: "Coptic UD",
description: "Universal Dependencies for Coptic. Final stage of the Egyptian language.",
url: "https://universaldependencies.org/treebanks/cop_scriptorium/index.html",
entity_types: ["PER", "LOC", "GPE"],
language: "cop",
domain: "religious",
license: "CC-BY-SA-4.0",
citation: "Zeldes & Schroeder (2016)",
year: 2016,
format: "CoNLLU",
notes: "Coptic; Gnostic and Biblical texts",
categories: [historical, ancient],
},
GothicUD {
name: "Gothic UD",
description: "Universal Dependencies for Gothic. Wulfila's Bible translation.",
url: "https://universaldependencies.org/treebanks/got_proiel/index.html",
entity_types: ["PER", "LOC", "GPE"],
language: "got",
domain: "religious",
license: "CC-BY-NC-SA-4.0",
citation: "PROIEL Team",
year: 2014,
format: "CoNLLU",
notes: "Gothic; oldest substantial Germanic text",
categories: [historical, ancient],
},
HittiteUD {
name: "Hittite UD",
description: "Universal Dependencies for Hittite. Ancient Anatolian language.",
url: "https://universaldependencies.org/treebanks/hit_hittb/index.html",
entity_types: ["PER", "LOC", "GPE"],
language: "hit",
domain: "historical",
license: "CC-BY-SA-4.0",
citation: "UD Hittite Team",
year: 2021,
format: "CoNLLU",
notes: "Cuneiform Hittite; Bronze Age Anatolia",
categories: [historical, ancient],
},
OldChurchSlavonicUD {
name: "Old Church Slavonic UD",
description: "Universal Dependencies for Old Church Slavonic (OCS). Medieval Slavic liturgical language.",
url: "https://universaldependencies.org/treebanks/cu_proiel/index.html",
entity_types: ["PER", "LOC", "GPE"],
language: "cu",
domain: "religious",
license: "CC-BY-NC-SA-4.0",
citation: "PROIEL Team",
year: 2014,
format: "CoNLLU",
notes: "Oldest Slavic literary language; Cyrillic/Glagolitic",
categories: [historical, ancient],
},
LatinITTB {
name: "Latin ITTB",
description: "Index Thomisticus Treebank. Medieval Latin theological texts.",
url: "https://universaldependencies.org/treebanks/la_ittb/index.html",
entity_types: ["PER", "LOC", "ORG"],
language: "la",
domain: "religious",
license: "CC-BY-NC-SA-3.0",
citation: "McGillivray et al. (2009)",
year: 2009,
format: "CoNLLU",
notes: "Aquinas texts; medieval scholastic Latin",
categories: [historical],
},
LatinPROIEL {
name: "Latin PROIEL",
description: "Pragmatic Resources in Old Indo-European Languages. Classical Latin.",
url: "https://universaldependencies.org/treebanks/la_proiel/index.html",
entity_types: ["PER", "LOC", "GPE"],
language: "la",
domain: "historical",
license: "CC-BY-NC-SA-4.0",
citation: "PROIEL Team",
year: 2014,
format: "CoNLLU",
notes: "Vulgate, Caesar, Cicero; classical and late Latin",
categories: [historical],
},
EsperantoUD {
name: "Esperanto UD",
description: "Universal Dependencies for Esperanto. Planned international language.",
url: "https://universaldependencies.org/treebanks/eo_pud/index.html",
entity_types: ["PER", "LOC", "ORG"],
language: "eo",
domain: "constructed_language",
license: "CC-BY-SA-4.0",
citation: "UD Esperanto Team",
year: 2017,
format: "CoNLLU",
notes: "Constructed language; regular agglutinative morphology",
categories: [constructed],
},
Dothraki {
name: "Dothraki",
description: "Dothraki language corpus. Game of Thrones constructed language.",
url: "https://wiki.dothraki.org/",
entity_types: ["PER", "LOC"],
language: "art-x-dothraki",
domain: "fiction",
license: "CC-BY-SA-4.0",
citation: "Peterson (2011)",
year: 2011,
format: "Custom",
notes: "Conlang by David Peterson; SVO word order",
categories: [constructed],
},
HighValyrian {
name: "High Valyrian",
description: "High Valyrian corpus. Game of Thrones constructed language.",
url: "https://wiki.dothraki.org/High_Valyrian",
entity_types: ["PER", "LOC"],
language: "hvy",
domain: "fiction",
license: "CC-BY-SA-4.0",
citation: "Peterson (2013)",
year: 2013,
format: "Custom",
notes: "Highly inflected conlang; 4 genders, 8 cases",
categories: [constructed],
},
Klingon {
name: "Klingon",
description: "Klingon language corpus. Star Trek constructed language.",
url: "https://github.com/klingonlanguage/klingon-data",
entity_types: ["PER", "LOC", "ORG"],
language: "tlh",
domain: "fiction",
license: "Research",
citation: "Okrand (1985)",
year: 1985,
format: "Custom",
notes: "OVS word order; unique phonology; active community",
categories: [constructed],
},
Quenya {
name: "Quenya",
description: "Quenya language corpus. Tolkien's Elvish language.",
url: "https://eldamo.org/",
entity_types: ["PER", "LOC"],
language: "qya",
domain: "fiction",
license: "CC-BY-4.0",
citation: "Tolkien (1954)",
year: 1954,
format: "Custom",
notes: "Finnish-inspired phonology; Tengwar script",
categories: [constructed],
},
Navi {
name: "Na'vi",
description: "Na'vi language corpus. Avatar constructed language.",
url: "https://learnnavi.org/",
entity_types: ["PER", "LOC"],
language: "art-x-navi",
domain: "fiction",
license: "Research",
citation: "Frommer (2009)",
year: 2009,
format: "Custom",
notes: "Free word order; ejectives; infixes",
categories: [constructed],
},
InterslavicCorpus {
name: "Interslavic",
description: "Interslavic zonal auxiliary language. Constructed for Slavic intelligibility.",
url: "https://interslavic.fun/",
entity_types: ["PER", "LOC", "ORG"],
language: "isv",
domain: "constructed_language",
license: "CC-BY-SA-4.0",
citation: "Interslavic Team (2006)",
year: 2006,
format: "Custom",
notes: "Maximizes mutual intelligibility across Slavic languages",
categories: [constructed],
},
Lojban {
name: "Lojban",
description: "Lojban logical language corpus. Constructed for unambiguous communication.",
url: "https://mw.lojban.org/",
entity_types: [],
language: "jbo",
domain: "constructed_language",
license: "Public Domain",
citation: "Cowan (1997)",
year: 1997,
format: "Custom",
notes: "Predicate logic-based; completely unambiguous grammar",
categories: [constructed],
},
TokiPona {
name: "Toki Pona",
description: "Toki Pona minimalist language corpus. 120-word philosophical language.",
url: "https://github.com/kilipan/toki-pona-corpus",
entity_types: [],
language: "tok",
domain: "constructed_language",
license: "CC-BY-SA-4.0",
citation: "Lang (2001)",
year: 2001,
format: "Custom",
notes: "Minimalist; tests compositional semantics",
categories: [constructed],
},
I2B22010 {
name: "i2b2-2010",
description: "i2b2/VA 2010 NLP Challenge. Clinical concept extraction and relations.",
url: "https://www.i2b2.org/NLP/DataSets/",
entity_types: ["Problem", "Treatment", "Test"],
language: "en",
domain: "clinical",
license: "DUA Required",
citation: "Uzuner et al. (2011)",
paper_url: "https://academic.oup.com/jamia/article/18/5/552/830538",
year: 2010,
format: "Custom",
notes: "Clinical notes; concept and relation extraction",
categories: [ner, relation_extraction, clinical],
},
I2b2Deidentification {
name: "i2b2 De-identification",
description: "i2b2 2014 De-identification Challenge. PHI recognition and removal.",
url: "https://www.i2b2.org/NLP/DataSets/",
entity_types: ["Name", "Date", "Address", "Phone", "SSN", "MRN"],
language: "en",
domain: "clinical",
license: "DUA Required",
citation: "Stubbs et al. (2015)",
year: 2014,
format: "Custom",
notes: "PHI de-identification; HIPAA compliance",
categories: [ner, clinical],
},
FrenchClinicalNER {
name: "French Clinical NER",
description: "French clinical NER from hospital records. APHP collaboration.",
url: "https://github.com/EDS-NLP/eds-nlp",
entity_types: ["Drug", "Disease", "Procedure", "Date"],
language: "fr",
domain: "clinical",
license: "DUA Required",
citation: "APHP Team (2022)",
year: 2022,
format: "Standoff",
notes: "French clinical text; covers multiple entity types",
categories: [ner, clinical, multilingual],
},
ShARe13 {
name: "ShARe/CLEF 2013",
description: "ShARe/CLEF eHealth 2013. Disorder mention recognition.",
url: "https://physionet.org/content/shareclefehealth2013/",
entity_types: ["Disorder"],
language: "en",
domain: "clinical",
license: "PhysioNet",
citation: "Suominen et al. (2013)",
year: 2013,
format: "Standoff",
notes: "Clinical disorder identification; SNOMED CT normalization",
categories: [ner, clinical, discontinuous_ner],
},
ShARe14 {
name: "ShARe/CLEF 2014",
description: "ShARe/CLEF eHealth 2014. Improved disorder normalization.",
url: "https://physionet.org/content/shareclefehealth2014/",
entity_types: ["Disorder"],
language: "en",
domain: "clinical",
license: "PhysioNet",
citation: "Mowery et al. (2014)",
year: 2014,
format: "Standoff",
notes: "Extended from 2013; template filling and normalization",
categories: [ner, clinical, discontinuous_ner],
},
CALCS {
name: "CALCS",
description: "Computational Approaches to Linguistic Code-Switching. Multiple language pairs.",
url: "https://code-switching.github.io/",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "social_media",
license: "Research",
citation: "CALCS Workshop",
year: 2018,
format: "CoNLL",
notes: "Code-switching NER; Spanish-English, Hindi-English",
categories: [ner, multilingual, social_media],
},
LinCE {
name: "LinCE",
description: "Linguistic Code-switching Evaluation. Multiple code-switching benchmarks.",
url: "https://ritual.uh.edu/lince/",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "social_media",
license: "Research",
citation: "Aguilar et al. (2020)",
paper_url: "https://aclanthology.org/2020.lrec-1.223/",
year: 2020,
format: "CoNLL",
notes: "Spanish-English, Hindi-English; includes NER task",
categories: [ner, multilingual, social_media],
},
GLUECoS {
name: "GLUECoS",
description: "Code-Switching GLUE benchmark. NLU for code-switched text.",
url: "https://github.com/microsoft/GLUECoS",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "social_media",
license: "MIT",
citation: "Khanuja et al. (2020)",
paper_url: "https://aclanthology.org/2020.emnlp-main.574/",
year: 2020,
format: "JSONL",
notes: "Hindi-English and Spanish-English; NLU tasks",
categories: [ner, multilingual, social_media],
},
ChemDataExtractor {
name: "ChemDataExtractor",
description: "Chemical data extraction toolkit benchmark. Chemical NER and properties.",
url: "https://chemdataextractor.org/",
entity_types: ["Chemical", "Property", "Value", "Unit"],
language: "en",
domain: "biomedical",
license: "MIT",
citation: "Swain & Cole (2016)",
year: 2016,
format: "Custom",
notes: "Chemical property extraction; materials science",
categories: [ner, biomedical],
},
HUPD {
name: "HUPD",
description: "Harvard USPTO Patent Dataset. Patent application NER.",
url: "https://github.com/suzgunmirac/hupd",
entity_types: ["Inventor", "Assignee", "Reference", "Claim"],
language: "en",
domain: "legal",
license: "Public Domain",
citation: "Suzgun et al. (2022)",
year: 2022,
format: "JSONL",
notes: "Patent applications; technical language",
categories: [ner],
},
FinTechPatent {
name: "FinTech Patent NER",
description: "FinTech patent entity extraction. Financial technology domain.",
url: "https://github.com/fintech-patent-ner",
entity_types: ["Technology", "Company", "Product", "Method"],
language: "en",
domain: "financial",
license: "CC-BY-4.0",
citation: "FinTech NER Team (2021)",
year: 2021,
format: "CoNLL",
notes: "FinTech patents; specialized terminology",
categories: [ner],
},
WaterAgriNER {
name: "WaterAgriNER",
description: "Water and agriculture domain NER. Environmental science entities.",
url: "https://github.com/wateragriner",
entity_types: ["Crop", "Chemical", "Equipment", "Location"],
language: "en",
domain: "scientific",
license: "CC-BY-4.0",
citation: "WaterAgriNER Team (2022)",
year: 2022,
format: "CoNLL",
notes: "Agricultural and water management domains",
categories: [ner],
},
WIESPAstro {
name: "WIESP Astrophysics",
description: "WIESP 2022 Astrophysics NER. NASA ADS literature.",
url: "https://ui.adsabs.harvard.edu/",
entity_types: ["Mission", "Instrument", "CelestialObject", "Phenomenon"],
language: "en",
domain: "astrophysics",
license: "Research",
citation: "WIESP Team (2022)",
year: 2022,
format: "JSONL",
notes: "Astrophysics entities; 31 fine-grained types",
categories: [ner, arcane_domain],
},
NERsocialFood {
name: "NER Social Food",
description: "Food-related NER from social media. Recipes and food mentions.",
url: "https://github.com/food-ner/social",
entity_types: ["Food", "Ingredient", "Brand", "Restaurant"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "Food NER Team (2021)",
year: 2021,
format: "CoNLL",
notes: "Social media food mentions; informal language",
categories: [ner, social_media],
},
RussianCulturalNER {
name: "Russian Cultural NER",
description: "Russian cultural heritage NER. Museums, artworks, cultural entities.",
url: "https://github.com/russian-cultural-ner",
entity_types: ["Artwork", "Artist", "Museum", "Period", "Style"],
language: "ru",
domain: "encyclopedia",
license: "CC-BY-4.0",
citation: "RuCultural Team (2022)",
year: 2022,
format: "CoNLL",
notes: "Russian cultural heritage; fine-grained art types",
categories: [ner, multilingual],
},
EighteenthCenturyNER {
name: "18th Century NER",
description: "Named entities in 18th century English text. Historical OCR challenges.",
url: "https://github.com/Living-with-machines/",
entity_types: ["PER", "LOC", "ORG", "DATE"],
language: "en",
domain: "historical",
license: "CC-BY-4.0",
citation: "Living with Machines (2020)",
year: 2020,
format: "CoNLL",
notes: "OCR noise; historical spelling variation",
categories: [ner, historical],
},
SpanishMedievalTEI {
name: "Spanish Medieval TEI",
description: "Medieval Spanish manuscript NER. TEI-encoded historical texts.",
url: "https://github.com/spanish-medieval-nlp",
entity_types: ["PER", "LOC", "ORG", "DATE"],
language: "es",
domain: "historical",
license: "CC-BY-4.0",
citation: "Spanish Medieval NLP (2021)",
year: 2021,
format: "XML",
notes: "Medieval Castilian; paleographic challenges",
categories: [ner, historical, multilingual],
},
MedievalCzechCharters {
name: "Medieval Czech Charters",
description: "Czech medieval charter NER. Historical legal documents.",
url: "https://github.com/czech-medieval-charters",
entity_types: ["PER", "LOC", "ORG", "DATE"],
language: "cs",
domain: "historical",
license: "CC-BY-4.0",
citation: "Czech Charter Team (2020)",
year: 2020,
format: "XML",
notes: "Medieval Czech and Latin; charter formulae",
categories: [ner, historical, multilingual],
},
DutchArchaeologyNER {
name: "Dutch Archaeology NER (v2)",
description: "Dutch archaeological excavation reports. DANS archive annotations.",
url: "https://easy.dans.knaw.nl/",
entity_types: ["Site", "Artifact", "Period", "Material"],
language: "nl",
domain: "archaeology",
license: "CC-BY-4.0",
citation: "DANS (2021)",
year: 2021,
format: "Standoff",
notes: "Archaeological domain; ~31k annotations",
categories: [ner, historical, multilingual],
},
GuaraniNER {
name: "Guaraní NER",
description: "Guaraní language NER. South American indigenous language.",
url: "https://github.com/guarani-nlp",
entity_types: ["PER", "LOC", "ORG"],
language: "gn",
domain: "indigenous",
license: "CC-BY-4.0",
citation: "Guaraní NLP Team (2021)",
year: 2021,
format: "CoNLL",
notes: "Low-resource indigenous language; Paraguay official language",
categories: [ner, indigenous, low_resource],
},
ShipiboKoniboNER {
name: "Shipibo-Konibo NER",
description: "Shipibo-Konibo language NER. Peruvian Amazonian language.",
url: "https://github.com/ixa-ehu/shipibo-konibo",
entity_types: ["PER", "LOC", "ORG"],
language: "shp",
domain: "indigenous",
license: "CC-BY-4.0",
citation: "Mager et al. (2018)",
year: 2018,
format: "CoNLL",
notes: "Endangered language; ~30k speakers",
categories: [ner, indigenous, low_resource],
},
NavajoMorph {
name: "Navajo Morphology",
description: "Navajo morphological annotation. North American indigenous language.",
url: "https://github.com/navajo-nlp",
entity_types: ["PER", "LOC"],
language: "nv",
domain: "indigenous",
license: "Research",
citation: "Navajo NLP Team (2020)",
year: 2020,
format: "CoNLLU",
notes: "Complex verb morphology; tonal language",
categories: [ner, indigenous, low_resource],
},
KoCoNovel {
name: "KoCoNovel",
description: "Korean character coreference in 50 modern/contemporary novels. First Korean literary coreference dataset. Four versions: Reader/Omniscient perspective × Separate/Overlapped entity treatment. 178K tokens, 19K mentions, ~1.4K entities.",
url: "https://github.com/storidient/KoCoNovel",
entity_types: ["PER"],
language: "ko",
domain: "fiction",
license: "CC-BY-SA-4.0",
citation: "Kim, Lee & Lee (2024)",
paper_url: "https://arxiv.org/abs/2404.01140",
year: 2024,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
size_hint: "178K tokens, 50 novels, 17975 sentences",
notes: "Mention types: Pronominal 30.7%, Proper Name 22.8%, Single Noun 24.1% (kinship 9.2%, titles 3.1%), Noun Phrase 22.4%. Korean address term culture (호칭 문화) favors kinship over names. Distance stats: Antecedent avg 70.7 tokens, Spread avg 1583.3 tokens. Korean lacks determiners and proper noun markers. Four annotation versions. Morpheme-unit spans. Speaker annotations. IAA: MUC 94.53 F1. BERT baseline: ~62-73% MUC F1.",
categories: [coref, literary, multilingual],
},
OpenBoek {
name: "OpenBoek",
description: "Dutch literary coreference. Open-source Dutch fiction annotation.",
url: "https://github.com/cltl/OpenBoek",
entity_types: ["PER", "LOC", "ORG"],
language: "nl",
domain: "fiction",
license: "CC-BY-4.0",
citation: "OpenBoek Team (2021)",
year: 2021,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
notes: "Dutch novels; literary coreference patterns",
categories: [coref, literary, multilingual],
},
SciCo {
name: "SciCo",
description: "Scientific coreference. Cross-document concept coreference in AI papers.",
url: "https://github.com/allenai/scico",
entity_types: ["Method", "Task", "Dataset"],
language: "en",
domain: "scientific",
license: "Apache-2.0",
citation: "Cattan et al. (2021)",
paper_url: "https://aclanthology.org/2021.emnlp-main.518/",
year: 2021,
format: "JSONL",
notes: "Scientific concepts; cross-document coreference",
categories: [coref],
},
SemEval2013Task91 {
name: "SemEval-2013 Task 9.1",
description: "Drug name recognition and classification (DDIExtraction 2013, subtask 9.1). SemEval shared task.",
url: "https://www.cs.york.ac.uk/semeval-2013/task9/",
entity_types: ["Drug", "Drug_n", "Group", "Brand"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Segura-Bedmar et al. (2013)",
paper_url: "https://aclanthology.org/S13-2056/",
year: 2013,
format: "XML",
notes: "Drug-drug interaction; MedLine and DrugBank",
categories: [ner, relation_extraction, biomedical],
},
PDTB3 {
name: "PDTB 3.0 (v2)",
description: "Penn Discourse Treebank 3.0. Discourse relations and connectives.",
url: "https://catalog.ldc.upenn.edu/LDC2019T05",
entity_types: [],
language: "en",
domain: "news",
license: "LDC",
citation: "Prasad et al. (2019)",
year: 2019,
format: "Custom",
notes: "Discourse relations; implicit and explicit connectives",
categories: [coref],
},
WinoPron {
name: "WinoPron",
description: "Winograd pronoun resolution. Commonsense coreference benchmark.",
url: "https://cs.nyu.edu/~davise/papers/WinoPron/",
entity_types: ["PER"],
language: "en",
domain: "evaluation",
license: "Research",
citation: "Davis & Marcus (2021)",
year: 2021,
format: "Custom",
notes: "Extended Winograd schemas; commonsense reasoning",
categories: [coref],
},
QUOREF {
name: "QUOREF",
description: "Question answering requiring coreference. Reading comprehension.",
url: "https://github.com/allenai/quoref",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "wikipedia",
license: "CC-BY-4.0",
citation: "Dasigi et al. (2019)",
paper_url: "https://aclanthology.org/D19-1606/",
year: 2019,
format: "JSONL",
notes: "QA requiring coreference resolution; Wikipedia paragraphs",
categories: [coref],
},
CoNLL2002Dutch {
name: "CoNLL-2002 Dutch",
description: "Dutch portion of CoNLL-2002 NER shared task. Newspaper text.",
url: "https://www.clips.uantwerpen.be/conll2002/ner/data/ned.testa",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "nl",
domain: "news",
license: "Research",
citation: "Tjong Kim Sang (2002)",
paper_url: "https://aclanthology.org/W02-2024/",
year: 2002,
format: "CoNLL",
annotation_scheme: "BIO",
notes: "Dutch newspaper NER; includes gazetteers",
categories: [ner, multilingual],
},
CoNLL2002Spanish {
name: "CoNLL-2002 Spanish",
description: "Spanish portion of CoNLL-2002 NER shared task. News articles.",
url: "https://www.clips.uantwerpen.be/conll2002/ner/data/esp.testa",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "es",
domain: "news",
license: "Research",
citation: "Tjong Kim Sang (2002)",
paper_url: "https://aclanthology.org/W02-2024/",
year: 2002,
format: "CoNLL",
annotation_scheme: "BIO",
notes: "Spanish EFE news agency articles",
categories: [ner, multilingual],
},
BC2GMFull {
name: "BC2GM Full",
description: "Complete BioCreative II Gene Mention corpus. Extended from BC2GM.",
url: "https://biocreative.bioinformatics.udel.edu/resources/biocreative-ii-corpus/",
entity_types: ["Gene", "Protein"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Smith et al. (2008)",
year: 2008,
format: "IOB2",
notes: "Full corpus including training data",
categories: [ner, biomedical],
},
FinNER {
name: "FinNER",
description: "Finnish named entity recognition. News and Wikipedia text.",
url: "https://github.com/mpsilfve/finer-data",
entity_types: ["PER", "LOC", "ORG", "DATE", "EVENT"],
language: "fi",
domain: "news",
license: "CC-BY-4.0",
citation: "Ruokolainen et al. (2020)",
year: 2020,
format: "CoNLL",
notes: "Finnish morphologically rich language NER",
categories: [ner, multilingual],
},
LegalNER {
name: "LegalNER",
description: "Legal Named Entity Recognition. Court cases and legislation.",
url: "https://github.com/legal-ner/legal-ner",
entity_types: ["Court", "Judge", "Lawyer", "Party", "Statute", "Case"],
language: "en",
domain: "legal",
license: "CC-BY-4.0",
citation: "LegalNER Team (2021)",
year: 2021,
format: "CoNLL",
notes: "Legal domain entities; US court documents",
categories: [ner],
},
CEREC {
name: "CEREC",
description: "Chinese entity and relation extraction corpus. Web text and news.",
url: "https://github.com/Stardust-hyx/CEREC",
entity_types: ["PER", "LOC", "ORG"],
language: "zh",
domain: "news",
license: "CC-BY-4.0",
citation: "Huang et al. (2021)",
year: 2021,
format: "JSONL",
notes: "Chinese NER and RE; includes nested entities",
categories: [ner, relation_extraction, multilingual],
},
DELICATE {
name: "DELICATE",
description: "Depression, emotion, and linguistic analysis corpus. Mental health text.",
url: "https://github.com/delicate-nlp/delicate",
entity_types: ["Symptom", "Treatment", "Emotion"],
language: "en",
domain: "clinical",
license: "Research",
citation: "DELICATE Team (2022)",
year: 2022,
format: "JSONL",
notes: "Mental health NER; sensitive domain",
categories: [ner, clinical],
},
SciERCNER {
name: "SciERC NER",
description: "Scientific Information Extraction NER. AI paper abstracts.",
url: "https://github.com/allenai/sciie/tree/main/data",
entity_types: ["Task", "Method", "Metric", "Material", "OtherScientificTerm", "Generic"],
language: "en",
domain: "scientific",
license: "Apache-2.0",
citation: "Luan et al. (2018)",
paper_url: "https://aclanthology.org/D18-1360/",
year: 2018,
format: "JSONL",
notes: "6 entity types; includes nested entities and coreference",
categories: [ner, nested_ner, relation_extraction],
},
ULNER {
name: "ULNER",
description: "Ultra-Large Scale NER. Massive silver-standard dataset.",
url: "",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "mixed",
license: "CC-BY-4.0",
citation: "ULNER Team (2023)",
year: 2023,
format: "JSONL",
notes: "No stable public URL found (prior HuggingFace URL returned 404).",
access_status: Deprecated,
categories: [ner],
},
UniversalNER {
name: "UniversalNER",
description: "Universal NER model benchmark. Multiple domains and languages.",
url: "https://huggingface.co/datasets/universalner/universal_ner",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "mixed",
license: "CC-BY-4.0",
citation: "Zhou et al. (2023)",
paper_url: "https://arxiv.org/abs/2308.03279",
year: 2023,
format: "JSONL",
notes: "ChatGPT-distilled NER model benchmark",
tasks: ["ner"],
hf_id: "universalner/universal_ner",
access_status: Public,
categories: [ner, multilingual],
},
ArrauGenia {
name: "ARRAU GENIA",
description: "ARRAU corpus GENIA portion. Biomedical coreference.",
url: "https://aclanthology.org/2020.codi-1.1/",
entity_types: ["Gene", "Protein", "Cell"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Uryupina et al. (2020)",
year: 2020,
format: "MMAX2",
annotation_scheme: "ARRAU",
notes: "Biomedical portion of ARRAU corpus",
categories: [coref, biomedical],
},
ArrauPear {
name: "ARRAU Pear Stories",
description: "ARRAU Pear Stories portion. Narrative coreference.",
url: "https://aclanthology.org/2020.codi-1.1/",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "narrative",
license: "Research",
citation: "Uryupina et al. (2020)",
year: 2020,
format: "MMAX2",
annotation_scheme: "ARRAU",
notes: "Film retelling narratives; discourse structure",
categories: [coref, literary],
},
ArrauRst {
name: "ARRAU RST",
description: "ARRAU RST-DT portion. Discourse-annotated Wall Street Journal.",
url: "https://aclanthology.org/2020.codi-1.1/",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "news",
license: "Research",
citation: "Uryupina et al. (2020)",
year: 2020,
format: "MMAX2",
annotation_scheme: "ARRAU",
notes: "WSJ with RST discourse structure",
categories: [coref],
},
ArrauTrains {
name: "ARRAU Trains",
description: "ARRAU Trains portion. Task-oriented dialogue coreference.",
url: "https://aclanthology.org/2020.codi-1.1/",
entity_types: ["PER", "LOC", "TIME"],
language: "en",
domain: "dialogue",
license: "Research",
citation: "Uryupina et al. (2020)",
year: 2020,
format: "MMAX2",
annotation_scheme: "ARRAU",
notes: "Task-oriented dialogue; train scheduling domain",
categories: [coref, dialogue],
},
NomBankImplicit {
name: "NomBank Implicit",
description: "Implicit arguments in NomBank. Nominal predicate-argument structures.",
url: "https://catalog.ldc.upenn.edu/LDC2008T23",
entity_types: [],
language: "en",
domain: "news",
license: "LDC",
citation: "Gerber & Chai (2012)",
year: 2012,
format: "Custom",
notes: "Implicit argument recovery; extends NomBank",
categories: [coref],
},
BASHI {
name: "BASHI",
description: "Bangla Shared Task on Information extraction. Bengali NER.",
url: "https://sites.google.com/view/ipm-bashi/",
entity_types: ["PER", "LOC", "ORG"],
language: "bn",
domain: "news",
license: "Research",
citation: "BASHI Team (2020)",
year: 2020,
format: "CoNLL",
notes: "Bengali (Bangla) NER; low-resource setting",
categories: [ner, multilingual, low_resource],
},
ERST {
name: "ERST",
description: "English RST Signalling Corpus. Discourse markers and signals.",
url: "https://github.com/rsttools/signal",
entity_types: [],
language: "en",
domain: "mixed",
license: "CC-BY-4.0",
citation: "Das & Taboada (2018)",
year: 2018,
format: "Custom",
notes: "Discourse signals; extends RST-DT",
categories: [coref],
},
BiTimeBERT {
name: "BiTimeBERT",
description: "Bi-directional temporal relation dataset. Event ordering and duration.",
url: "https://github.com/btime-bert/bitimebert",
entity_types: ["Event", "Time"],
language: "en",
domain: "news",
license: "CC-BY-4.0",
citation: "BiTimeBERT Team (2022)",
year: 2022,
format: "JSONL",
notes: "Temporal reasoning; event-time relations",
categories: [ner, temporal],
},
TRIDIS {
name: "TRIDIS",
description: "Triple Discourse dataset. Entity and discourse relations.",
url: "https://github.com/tridis/tridis",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "mixed",
license: "CC-BY-4.0",
citation: "TRIDIS Team (2021)",
year: 2021,
format: "JSONL",
notes: "Combined entity and discourse annotation",
categories: [coref],
},
QueerBench {
name: "QueerBench",
description: "Queer identity coreference benchmark. LGBTQ+ representation in NLP.",
url: "https://github.com/queerbench/queerbench",
entity_types: ["PER"],
language: "en",
domain: "evaluation",
license: "CC-BY-4.0",
citation: "QueerBench Team (2022)",
year: 2022,
format: "JSONL",
notes: "Tests coreference for non-binary pronouns; bias evaluation",
categories: [coref, bias_evaluation],
},
QUEEREOTYPES {
name: "QUEEREOTYPES",
description: "LGBTQ+ stereotype detection in text. Bias in language models.",
url: "https://github.com/queereotypes/queereotypes",
entity_types: [],
language: "en",
domain: "evaluation",
license: "CC-BY-4.0",
citation: "Felkner et al. (2023)",
year: 2023,
format: "JSONL",
notes: "Stereotype detection; tests model biases",
categories: [bias_evaluation],
},
MAP {
name: "MAP",
description: "Medical Annotation Pipeline dataset. Clinical concept normalization.",
url: "https://github.com/medical-annotation-pipeline/map",
entity_types: ["Drug", "Disease", "Procedure"],
language: "en",
domain: "clinical",
license: "DUA Required",
citation: "MAP Team (2021)",
year: 2021,
format: "Standoff",
notes: "Clinical concept extraction and normalization",
categories: [ner, clinical],
},
ASN {
name: "ASN",
description: "Atomic Slot Number dataset. Slot filling benchmark.",
url: "http://www.cs.toronto.edu/~varada/ASN/",
entity_types: ["Organization", "Person", "Date"],
language: "en",
domain: "news",
license: "Research",
citation: "Law et al. (2013)",
year: 2013,
format: "Custom",
notes: "Atomic slot filling; relation extraction",
categories: [relation_extraction],
},
CSN {
name: "CSN",
description: "Code Search Net. Programming language dataset for code understanding.",
url: "https://github.com/github/CodeSearchNet",
entity_types: ["Function", "Class", "Variable"],
language: "mul",
domain: "code",
license: "MIT",
citation: "Husain et al. (2019)",
paper_url: "https://arxiv.org/abs/1909.09436",
year: 2019,
format: "JSONL",
notes: "Code entity and function extraction; 6 languages",
categories: [ner],
},
HOMOMEX {
name: "HOMOMEX",
description: "Homonym resolution in Mexican Spanish. Word sense disambiguation.",
url: "https://github.com/homomex/homomex",
entity_types: [],
language: "es",
domain: "general",
license: "CC-BY-4.0",
citation: "HOMOMEX Team (2021)",
year: 2021,
format: "JSONL",
notes: "Mexican Spanish; tests regional variation",
categories: [multilingual],
},
ENER {
name: "ENER",
description: "E-commerce NER. Product entities in e-commerce text.",
url: "https://github.com/ener-dataset/ener",
entity_types: ["Product", "Brand", "Attribute", "Price"],
language: "en",
domain: "e-commerce",
license: "CC-BY-4.0",
citation: "ENER Team (2022)",
year: 2022,
format: "CoNLL",
notes: "E-commerce domain; product catalogs",
categories: [ner],
},
FIREBALL {
name: "FIREBALL",
description: "D&D gameplay NLG with true game state. ~25k sessions, 153k turns with structured game state.",
url: "https://huggingface.co/datasets/lara-martin/FIREBALL",
entity_types: ["Character", "Item", "Location", "Creature", "Spell", "Action"],
language: "en",
domain: "gaming",
license: "CC-BY-4.0",
citation: "Zhu et al. (2023)",
paper_url: "https://par.nsf.gov/biblio/10463286",
year: 2023,
format: "JSONL",
size_hint: "~25k sessions, 153k turns",
notes: "D&D actual play with structured game state; tests NLG in narrative gaming",
categories: [ner, dialogue],
},
DnDNERBenchmark {
name: "D&D NER Benchmark",
description: "Fantasy NER from 7 D&D adventure books. LLM-annotated fantasy entities.",
url: "https://aclanthology.org/2023.ranlp-1.130.pdf",
entity_types: ["Character", "Location", "Item", "Creature", "Spell", "Organization"],
language: "en",
domain: "gaming",
license: "Research",
citation: "Weerasundara & de Silva (2023)",
paper_url: "https://aclanthology.org/2023.ranlp-1.130/",
year: 2023,
format: "CoNLL",
notes: "Fantasy domain; Flair/Trankit/SpaCy benchmarks; tests fictional entity recognition",
categories: [ner, literary],
},
CriticalRoleDataset {
name: "Critical Role Dataset",
description: "Unscripted live D&D transcripts. Storytelling and dialogue analysis.",
url: "https://www.microsoft.com/en-us/research/wp-content/uploads/2020/06/R.Rameshkumar-and-P.Bailey-Storytelling-with-Dialogue-ACL2020.pdf",
entity_types: ["Character", "Location", "Item"],
language: "en",
domain: "gaming",
license: "Research",
citation: "Rameshkumar & Bailey (2020)",
paper_url: "https://aclanthology.org/2020.acl-main.459/",
year: 2020,
format: "Custom",
notes: "Live improvised gameplay transcripts; narrative coherence and character tracking",
categories: [ner, dialogue, literary],
},
CUAD {
name: "CUAD",
description: "Contract Understanding Atticus Dataset. 13k+ labels across 510 commercial contracts.",
url: "https://www.atticusprojectai.org/cuad",
entity_types: ["Party", "Date", "Amount", "Clause", "Jurisdiction"],
language: "en",
domain: "legal",
license: "CC-BY-4.0",
citation: "Hendrycks et al. (2021)",
paper_url: "https://arxiv.org/abs/2103.06268",
year: 2021,
format: "JSONL",
size_hint: "510 contracts, 13k+ annotations, 41 clause types",
notes: "Contract clause extraction; covers indemnification, IP, termination clauses",
categories: [ner],
},
ACORD {
name: "ACORD",
description: "Expert-annotated clause retrieval for contract drafting. 114 queries, 126k+ pairs.",
url: "https://arxiv.org/html/2501.06582v1",
entity_types: ["Clause", "Party", "Obligation", "Condition"],
language: "en",
domain: "legal",
license: "Research",
citation: "ACORD Team (2025)",
paper_url: "https://arxiv.org/abs/2501.06582",
year: 2025,
format: "JSONL",
size_hint: "114 queries, 126k+ query-clause pairs with 1-5 star rankings",
notes: "Clause retrieval; Limitation of Liability, Indemnification, MFN clauses",
categories: [ner],
},
PartyExtractionDataset {
name: "Party Extraction Dataset",
description: "Legal party identification from contracts. Contextual span representations.",
url: "https://aclanthology.org/2023.ranlp-1.116.pdf",
entity_types: ["Party", "Role", "Organization"],
language: "en",
domain: "legal",
license: "Research",
citation: "Tuggener et al. (2023)",
paper_url: "https://aclanthology.org/2023.ranlp-1.116/",
year: 2023,
format: "Standoff",
notes: "Legal party NER; disambiguates parties in complex contract structures",
categories: [ner],
},
FINERFood {
name: "FINER (Food)",
description: "Food ingredient NER. 181k ingredient phrases in IOB2 format.",
url: "https://figshare.com/articles/dataset/Food_Ingredient_Named-Entity_Data/20222361",
entity_types: ["Ingredient", "Product", "Quantity", "Unit", "State"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "Popovski et al. (2022)",
year: 2022,
format: "BIO",
size_hint: "181,970 ingredient phrases",
notes: "Semi-supervised multi-model prediction for ingredient parsing",
categories: [ner, arcane_domain],
},
NHKRecipeDataset {
name: "NHK Recipe Dataset",
description: "Japanese recipes with ingredient state tracking across cooking steps.",
url: "https://arxiv.org/html/2507.17232v1",
entity_types: ["Ingredient", "Action", "State", "Tool"],
language: "ja",
domain: "food",
license: "Research",
citation: "NHK Team (2025)",
paper_url: "https://arxiv.org/abs/2507.17232",
year: 2025,
format: "JSONL",
notes: "State transitions per ingredient; procedural understanding in Japanese",
categories: [ner, multilingual, arcane_domain],
},
SanskritNERBhagavadGita {
name: "Sanskrit NER (Bhagavad Gita)",
description: "Sanskrit NER from Bhagavad Gita and Patanjali Yoga Sutras.",
url: "https://www.kaggle.com/datasets/akashsuklabaidya/ner-dataset-fyp-25",
entity_types: ["PER", "LOC", "ORG", "CONCEPT"],
language: "sa",
domain: "religious",
license: "Research",
citation: "Suklabaidya (2025)",
year: 2025,
format: "CoNLL",
notes: "Classical Sanskrit texts; tests Indic script and religious terminology",
categories: [ner, ancient, arcane_domain],
},
Mahanama {
name: "Mahānāma",
description: "Sanskrit Entity Discovery and Linking from Mahābhārata. World's largest epic with extreme name variation.",
url: "https://github.com/sujoysarkarai/mahanama",
entity_types: ["Person", "Location", "Miscellaneous"],
language: "sa",
domain: "literary",
license: "CC-BY-4.0",
citation: "Sarkar et al. (2025)",
paper_url: "https://arxiv.org/abs/2509.19844",
year: 2025,
format: "CoNLLU",
annotation_scheme: "Standoff",
size_hint: "988K tokens, 73K verses, 109K mentions, 5.5K entities",
notes: "First large-scale Sanskrit literary EDL. Character-level boundaries for sandhi MWTs (39% of mentions). Cross-lingual KB in English. SLP1 encoding. Extreme challenges: 124.42 avg name forms per major entity (max 1385 for Śiva), 47% entity ambiguity. Best baseline: 51.57% coref F1, 64.19% EL F1.",
splits: ["train", "dev", "test"],
tasks: ["ner", "coref", "el"],
categories: [coref, literary, ancient, long_document, arcane_domain, low_resource],
},
AkkadianCuneiformDataset {
name: "Akkadian Cuneiform Dataset",
description: "Unicode cuneiform with transliteration. Old/Middle Babylonian, Neo-Assyrian.",
url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC7592802/",
entity_types: ["Person", "Place", "God", "Object"],
language: "akk",
domain: "historical",
license: "CC-BY-4.0",
citation: "Gordin et al. (2020)",
paper_url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC7592802/",
year: 2020,
format: "Custom",
notes: "Cuneiform glyphs with segmentation; covers ~2000 years of Mesopotamian text",
categories: [ner, ancient, historical],
},
HeidelbergCuneiformBenchmark {
name: "Heidelberg Cuneiform Benchmark",
description: "Cuneiform sign classification across historical periods.",
url: "https://direct.mit.edu/coli/article/49/3/703/116160",
entity_types: ["Sign", "Determinative", "Logogram"],
language: "akk",
domain: "historical",
license: "Research",
citation: "Heidelberg Team (2023)",
paper_url: "https://direct.mit.edu/coli/article/49/3/703/116160",
year: 2023,
format: "Custom",
notes: "Sign-level classification; tests paleographic variation across periods",
categories: [ner, ancient, historical],
},
GreekMythologyKG {
name: "Greek Mythology Knowledge Graph",
description: "Coref + RE pipeline for mythological texts. 15k+ entities from Roscher's Lexikon.",
url: "https://www.semantic-web-journal.net/system/files/swj2754.pdf",
entity_types: ["Deity", "Hero", "Place", "Creature", "Object", "Event"],
language: "en",
domain: "mythology",
license: "CC-BY-4.0",
citation: "Myth KG Team (2019)",
paper_url: "https://www.semantic-web-journal.net/content/greek-mythology-knowledge-graph",
year: 2019,
format: "Custom",
notes: "RDF conversion of mythological texts; handles divine genealogies and epithets",
categories: [ner, coref, arcane_domain],
},
FolkloreMotifDistribution {
name: "Folklore Motif Distribution",
description: "548 folklore motifs across 309 ethnic traditions in the Old World.",
url: "https://www.academia.edu/14481230/",
entity_types: ["Motif", "Tradition", "Region", "Character"],
language: "mul",
domain: "mythology",
license: "Research",
citation: "Berezkin et al. (2015)",
year: 2015,
format: "Custom",
notes: "Cross-cultural motif tracking; tests cultural entity alignment",
categories: [ner, multilingual, arcane_domain],
},
NDNER {
name: "ND-NER",
description: "National defense OSINT NER. 17+ entity types for military equipment.",
url: "https://github.com/XinyanLi2016/ND-NER",
entity_types: ["AIRCRAFT", "SHIP", "MISSILE", "TANK", "FIREARM", "ELECTRONIC", "MASS_DESTR", "SPACE", "NEW"],
language: "en",
domain: "defense",
license: "CC-BY-SA-4.0",
citation: "Li et al. (2022)",
year: 2022,
format: "CoNLL",
notes: "Nested and flat versions; covers WMDs, directed energy, kinetic weapons",
categories: [ner, nested_ner, arcane_domain],
},
Re3dDefense {
name: "re3d (Defense)",
description: "Relationship and Entity Extraction Evaluation Dataset for defense domain.",
url: "https://github.com/dstl/re3d",
entity_types: ["Person", "Organization", "Location", "Equipment", "Event"],
language: "en",
domain: "defense",
license: "OGL",
citation: "DSTL (2016)",
year: 2016,
format: "BRAT",
notes: "UK Defence Science; relationship extraction for intelligence analysis",
categories: [ner, relation_extraction, arcane_domain],
},
CyNERAptner {
name: "CyNER-APTNER",
description: "Unified cyber threat intelligence NER. Malware, threat actors, IOCs.",
url: "https://ceur-ws.org/Vol-3928/paper_170.pdf",
entity_types: ["Malware", "ThreatActor", "Vulnerability", "Indicator", "Tool"],
language: "en",
domain: "cybersecurity",
license: "Research",
citation: "CyNER Team (2024)",
paper_url: "https://ceur-ws.org/Vol-3928/paper_170.pdf",
year: 2024,
format: "CoNLL",
notes: "Merged cyber threat datasets; security bulletin extraction",
categories: [ner, arcane_domain],
},
ChineseEngineeringGeologyNER {
name: "Chinese Engineering Geology NER",
description: "Geological disasters NER with EDA-based augmentation for small samples.",
url: "https://www.sciencedirect.com/science/article/abs/pii/S0957417423024272",
entity_types: ["Disaster", "Location", "Cause", "Measure", "Material"],
language: "zh",
domain: "geology",
license: "Research",
citation: "Geology NER Team (2023)",
paper_url: "https://doi.org/10.1016/j.eswa.2023.122427",
year: 2023,
format: "BIO",
notes: "Engineering geology reports; data augmentation for low-resource domain",
categories: [ner, multilingual, arcane_domain],
},
LLMRocMinNER {
name: "LLM-RocMin-NER",
description: "Rocks and minerals NER. 2-shot prompt-based extraction with nested handling.",
url: "https://www.sciencedirect.com/science/article/abs/pii/S0098300425000949",
entity_types: ["Rock", "Mineral", "Element", "Property", "Location"],
language: "en",
domain: "geology",
license: "CC-BY-4.0",
citation: "RocMin Team (2025)",
paper_url: "https://doi.org/10.1016/j.cageo.2025.105949",
year: 2025,
format: "JSONL",
notes: "Few-shot geoscience NER; handles nested mineral compositions",
categories: [ner, nested_ner, arcane_domain],
},
PolyIE {
name: "PolyIE",
description: "Polymer materials NER + relation extraction from literature.",
url: "https://ramprasad.mse.gatech.edu/PolyIE/",
entity_types: ["Polymer", "Property", "Value", "Condition", "Method"],
language: "en",
domain: "materials",
license: "CC-BY-4.0",
citation: "Cheung et al. (2024)",
paper_url: "https://aclanthology.org/2024.naacl-long.131/",
year: 2024,
format: "JSONL",
notes: "Polymer science literature; property-structure relationships",
categories: [ner, relation_extraction, arcane_domain],
},
MathDial {
name: "MathDial",
description: "Teacher-student tutoring dialogues on multi-step math problems.",
url: "https://arxiv.org/abs/2305.14536",
entity_types: ["Student", "Teacher", "Problem", "Step", "Hint"],
language: "en",
domain: "education",
license: "CC-BY-4.0",
citation: "Macina et al. (2023)",
paper_url: "https://arxiv.org/abs/2305.14536",
year: 2023,
format: "JSONL",
size_hint: "3,000 tutoring dialogues",
notes: "Scaffolding questions taxonomy; tests pedagogical dialogue understanding",
categories: [ner, dialogue, arcane_domain],
},
CoMTA {
name: "CoMTA",
description: "Student-GPT4 Khanmigo tutor dialogues for knowledge tracing.",
url: "https://learninganalytics.upenn.edu/ryanbaker/",
entity_types: ["Student", "Tutor", "Concept", "Question", "Response"],
language: "en",
domain: "education",
license: "Research",
citation: "Baker et al. (2025)",
year: 2025,
format: "JSONL",
size_hint: "188 dialogues",
notes: "LLM tutoring evaluation; knowledge tracing in AI tutors",
categories: [ner, dialogue, arcane_domain],
},
FrenchFullLengthFictionCoref {
name: "French Full-Length Fiction Coreference",
description: "Complete French novels spanning three centuries with character coreference.",
url: "https://arxiv.org/html/2510.15594v1",
entity_types: ["Character", "Location", "Organization"],
language: "fr",
domain: "fiction",
license: "CC-BY-4.0",
citation: "French Fiction Team (2025)",
paper_url: "https://arxiv.org/abs/2510.15594",
year: 2025,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
notes: "Full novels with gender inference; tests long-document literary coref",
categories: [coref, literary, multilingual, long_document],
},
WinogradSchemaChallengeWSC {
name: "Winograd Schema Challenge",
description: "Pronoun resolution requiring world knowledge. 273 sentence pairs.",
url: "https://cs.nyu.edu/~davise/papers/WinogradSchemas/WSCollection.xml",
entity_types: ["PER"],
language: "en",
domain: "evaluation",
license: "Research",
citation: "Levesque et al. (2012)",
paper_url: "https://aclanthology.org/N15-1117/",
year: 2012,
format: "XML",
size_hint: "273 sentence pairs",
notes: "Commonsense reasoning benchmark; tests world knowledge in coreference",
categories: [coref, bias_evaluation],
},
TVShowMultilingualCoref {
name: "TV Show Multilingual Coreference",
description: "English TV show transcripts with projections to Chinese and Farsi.",
url: "https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00581/117162",
entity_types: ["Character", "Location", "Object"],
language: "mul",
domain: "dialogue",
license: "Research",
citation: "Khosla et al. (2023)",
paper_url: "https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00581",
year: 2023,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
notes: "Cross-lingual projection via subtitles; multiparty TV dialogue",
categories: [coref, multilingual, dialogue],
},
VisDialCoref {
name: "VisDial Coreference",
description: "Visual dialog with 120k images and 10-turn dialogs requiring visual coref.",
url: "https://www.sciencedirect.com/science/article/pii/S266729522300082X",
entity_types: ["Object", "Person", "Location"],
language: "en",
domain: "vision",
license: "CC-BY-4.0",
citation: "Das et al. (2017)",
paper_url: "https://arxiv.org/abs/1611.08669",
year: 2017,
format: "JSONL",
size_hint: "120k images, 10-turn dialogs",
notes: "Visual coreference; grounding referents in images",
categories: [coref, dialogue],
},
RISeC {
name: "RISeC",
description: "Procedural cooking text with temporal relations and manner descriptions.",
url: "https://arxiv.org/html/2411.18157v1",
entity_types: ["Ingredient", "Tool", "Action", "State", "Time"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "RISeC Team (2024)",
paper_url: "https://arxiv.org/abs/2411.18157",
year: 2024,
format: "Standoff",
notes: "Procedural coreference; tracks ingredient state through cooking steps",
categories: [coref, arcane_domain],
},
EFGC {
name: "EFGC",
description: "Cooking coreference segmented by tools, foods, and actions.",
url: "https://arxiv.org/html/2411.18157v1",
entity_types: ["Food", "Tool", "Action"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "EFGC Team (2024)",
paper_url: "https://arxiv.org/abs/2411.18157",
year: 2024,
format: "CoNLL",
notes: "Entity flow graphs for cooking; tracks transformations",
categories: [coref, arcane_domain],
},
SPoRC {
name: "SPoRC",
description: "Structured Podcast Research Corpus. 1.1M episodes with host/guest extraction.",
url: "https://arxiv.org/html/2411.07892v1",
entity_types: ["Host", "Guest", "Organization", "Topic"],
language: "en",
domain: "speech",
license: "Research",
citation: "SPoRC Team (2024)",
paper_url: "https://aclanthology.org/2025.acl-long.1222/",
year: 2024,
format: "JSONL",
size_hint: "1.1M podcast episodes",
notes: "Speaker diarization; host/guest inference from transcripts",
categories: [ner, speech, dialogue],
},
ARFFiction {
name: "ARF (Artificial Relationships in Fiction)",
description: "Synthetic RE dataset for literary texts. GPT-4o generated annotations.",
url: "https://aclanthology.org/2025.latechclfl-1.13.pdf",
entity_types: ["Character", "Location", "Object", "Event"],
language: "en",
domain: "fiction",
license: "CC-BY-4.0",
citation: "ARF Team (2025)",
paper_url: "https://aclanthology.org/2025.latechclfl-1.13/",
year: 2025,
format: "JSONL",
notes: "Literary relationship extraction; synthetic from public domain fiction",
categories: [relation_extraction, literary],
},
CRAFTCorpusCoref {
name: "CRAFT Corpus (Full Coref)",
description: "Biomedical coref with ~30k relations. 23% span 500-12k words.",
url: "https://github.com/UCDenver-ccp/CRAFT",
entity_types: ["Gene", "Protein", "Cell", "Organism", "Chemical"],
language: "en",
domain: "biomedical",
license: "CC-BY-4.0",
citation: "Cohen et al. (2017)",
paper_url: "https://arxiv.org/html/2510.25087v1",
year: 2017,
format: "Standoff",
size_hint: "97 full-text PubMed articles, ~30k coref relations",
notes: "Long-range dependencies; identity and appositive links; tests long-document coref",
categories: [coref, biomedical, long_document],
},
AerospaceNERDataset {
name: "Aerospace NER Dataset",
description: "First open-source aerospace NER. 5 entity types for aviation knowledge graphs.",
url: "https://arc.aiaa.org/doi/10.2514/1.I011251",
entity_types: ["Aircraft", "Component", "Manufacturer", "Mission", "System"],
language: "en",
domain: "aerospace",
license: "Research",
citation: "AIAA (2023)",
paper_url: "https://arc.aiaa.org/doi/10.2514/1.I011251",
year: 2023,
format: "CoNLL",
notes: "Aviation product knowledge graphs; technical aerospace terminology",
categories: [ner, arcane_domain],
},
AviationProductsNER {
name: "Aviation Products NER",
description: "Chinese aviation manufacturing corpus. Complex product entities.",
url: "https://dspace.lib.cranfield.ac.uk/server/api/core/bitstreams/a59ed640-4783-4ddb-871b-6fd8bd0e7400/content",
entity_types: ["Product", "Component", "Process", "Material"],
language: "zh",
domain: "aerospace",
license: "Research",
citation: "Cranfield (2022)",
year: 2022,
format: "BIO",
notes: "Aviation manufacturing technical documents in Chinese",
categories: [ner, multilingual, arcane_domain],
},
VREN {
name: "VREN (Volleyball)",
description: "Volleyball rally descriptions for tactical statistics extraction.",
url: "https://arxiv.org/html/2406.12252v1",
entity_types: ["Player", "Action", "Position", "Team", "Score"],
language: "en",
domain: "sports",
license: "CC-BY-4.0",
citation: "VREN Team (2024)",
paper_url: "https://arxiv.org/abs/2406.12252",
year: 2024,
format: "JSONL",
notes: "Sports NLG; tactical action recognition from natural language",
categories: [ner, arcane_domain],
},
FashionIQ {
name: "Fashion IQ",
description: "77k fashion images with relative captions. 1000 attribute labels.",
url: "https://github.com/XiaoxiaoGuo/fashion-iq",
entity_types: ["Texture", "Fabric", "Shape", "Part", "Style", "Color"],
language: "en",
domain: "fashion",
license: "Research",
citation: "Wu et al. (2021)",
paper_url: "https://users.cs.utah.edu/~ziad/papers/cvpr_2021_fashion_iq.pdf",
year: 2021,
format: "JSONL",
size_hint: "77k images, 1000 attribute labels",
notes: "Dialog-based fashion retrieval; fine-grained attribute extraction",
categories: [ner, arcane_domain],
},
NaturalProductsRE {
name: "Natural Products RE",
description: "Relation extraction in underexplored biomedical domains. Diversity-sampled entities.",
url: "https://direct.mit.edu/coli/article/50/3/953/121178",
entity_types: ["NaturalProduct", "Organism", "Activity", "Target"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Hettiarachchi et al. (2024)",
paper_url: "https://direct.mit.edu/coli/article/50/3/953/121178",
year: 2024,
format: "JSONL",
notes: "LOTUS-derived NP dataset; synthetic data generation achieved F1=59.0",
categories: [relation_extraction, biomedical],
},
DrugProtBioCreative {
name: "DrugProt",
description: "Chemical-protein interactions from BioCreative VII challenge.",
url: "https://biocreative.bioinformatics.udel.edu/tasks/biocreative-vii/track-1/",
entity_types: ["Chemical", "Gene", "Protein"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "BioCreative VII (2021)",
paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/baac098/6833204",
year: 2021,
format: "BRAT",
notes: "Drug-protein interaction classification; BioCreative shared task",
categories: [relation_extraction, biomedical],
},
MOFDataset {
name: "MOF Dataset",
description: "Metal-organic frameworks joint NER+RE. GPT-3/Llama extraction.",
url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC10869356/",
entity_types: ["MOF", "Linker", "Metal", "Property", "Application"],
language: "en",
domain: "materials",
license: "CC-BY-4.0",
citation: "MOF Team (2024)",
paper_url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC10869356/",
year: 2024,
format: "JSONL",
notes: "Metal-organic framework literature; LLM-based extraction pipeline",
categories: [ner, relation_extraction, arcane_domain],
},
SolidStateDoping {
name: "Solid-State Doping",
description: "Impurity doping in materials. Joint NER+RE from literature.",
url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC10869356/",
entity_types: ["Host", "Dopant", "Property", "Concentration", "Method"],
language: "en",
domain: "materials",
license: "CC-BY-4.0",
citation: "Doping Team (2024)",
paper_url: "https://pmc.ncbi.nlm.nih.gov/articles/PMC10869356/",
year: 2024,
format: "JSONL",
notes: "Semiconductor doping literature; tests materials science terminology",
categories: [ner, relation_extraction, arcane_domain],
},
AgriNER {
name: "AgriNER",
description: "Agricultural knowledge graph construction. 36 entity types, 9 relation types.",
url: "https://2023.eswc-conferences.org/wp-content/uploads/2023/05/paper_De_2023_AgriNER.pdf",
entity_types: ["Crop", "Disease", "Soil", "Pathogen", "Pesticide", "Product"],
language: "en",
domain: "agriculture",
license: "Research",
citation: "De et al. (2023)",
paper_url: "https://2023.eswc-conferences.org/AgriNER/",
year: 2023,
format: "JSONL",
notes: "Agricultural KG construction; covers crops, diseases, soil, pathogens",
categories: [ner, relation_extraction, arcane_domain],
},
AGRONER {
name: "AGRONER",
description: "Unsupervised agricultural NER. Six major agricultural entity types.",
url: "https://www.sciencedirect.com/science/article/abs/pii/S0957417423009429",
entity_types: ["Disease", "Soil", "Pathogen", "Pesticide", "Crop", "Product"],
language: "en",
domain: "agriculture",
license: "Research",
citation: "AGRONER Team (2023)",
paper_url: "https://doi.org/10.1016/j.eswa.2023.121001",
year: 2023,
format: "BIO",
notes: "Unsupervised approach; no manual annotation required",
categories: [ner, arcane_domain],
},
AgMNER {
name: "AgMNER",
description: "Chinese multimodal agricultural NER. Text and speech combined.",
url: "https://www.nature.com/articles/s41598-025-88874-9",
entity_types: ["Crop", "Disease", "Pest", "Method"],
language: "zh",
domain: "agriculture",
license: "CC-BY-4.0",
citation: "AgMNER Team (2025)",
paper_url: "https://www.nature.com/articles/s41598-025-88874-9",
year: 2025,
format: "JSONL",
notes: "Multimodal NER; combines text and speech for agricultural domain",
categories: [ner, multilingual, speech, arcane_domain],
},
PolishCoreferenceCorpus {
name: "Polish Coreference Corpus",
description: "Polish coreference resolution corpus. General domain Polish text.",
url: "http://zil.ipipan.waw.pl/PolishCoreferenceCorpus",
entity_types: ["PER", "ORG", "LOC"],
language: "pl",
domain: "general",
license: "CC-BY-SA-4.0",
citation: "Ogrodniczuk et al. (2015)",
year: 2015,
format: "Custom",
annotation_scheme: "Custom",
notes: "Polish morphological complexity; rich inflection system",
categories: [coref, multilingual],
},
ArabicEventCoref {
name: "Arabic Event Coreference",
description: "Arabic event coreference. Underexplored language for event coref.",
url: "https://dl.acm.org/doi/10.1145/3743047",
entity_types: ["Event", "Time", "Location", "Participant"],
language: "ar",
domain: "news",
license: "Research",
citation: "Arabic Event Coref Team (2024)",
paper_url: "https://dl.acm.org/doi/10.1145/3743047",
year: 2024,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
notes: "Arabic event coreference; RTL script; underexplored language",
categories: [coref, event_coref, multilingual],
},
HindiEnglishSocialMediaNER {
name: "Hindi-English Social Media NER",
description: "Code-switched Hindi-English NER from social media.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["PER", "LOC", "ORG"],
language: "hi-en",
domain: "social_media",
license: "Research",
citation: "Hindi-English NER Team",
year: 2018,
format: "CoNLL",
notes: "Code-switching between Hindi (Devanagari) and English; social media",
categories: [ner, multilingual, social_media, low_resource],
},
AstroBERTCorpus {
name: "astroBERT Corpus",
description: "Domain-specific BERT trained on 395k astronomical papers.",
url: "https://arxiv.org/html/2310.17892v2",
entity_types: ["CelestialObject", "Mission", "Instrument", "Phenomenon"],
language: "en",
domain: "astronomy",
license: "Research",
citation: "Grezes et al. (2023)",
paper_url: "https://arxiv.org/abs/2310.17892",
year: 2023,
format: "Custom",
size_hint: "395,499 astronomical papers",
notes: "Domain-adapted BERT for astronomical entity extraction",
categories: [ner, arcane_domain],
},
AstronomicalTelegramKEE {
name: "Astronomical Telegram KEE",
description: "Event IDs, object names, telescope names from GCN Circulars.",
url: "https://www.raa-journal.org/issues/all/2024/v24n6/202405/",
entity_types: ["EventID", "ObjectName", "TelescopeName", "Observatory"],
language: "en",
domain: "astronomy",
license: "Research",
citation: "KEE Team (2024)",
paper_url: "https://www.raa-journal.org/issues/all/2024/v24n6/202405/",
year: 2024,
format: "JSONL",
notes: "LLM extraction from GCN Circulars; astronomical event reports",
categories: [ner, arcane_domain],
},
Saraga {
name: "Saraga",
description: "Indian Art Music dataset. Carnatic and Hindustani traditions.",
url: "https://arxiv.org/pdf/2309.16396.pdf",
entity_types: ["Raaga", "Taala", "Artist", "Composition", "Instrument"],
language: "mul",
domain: "music",
license: "CC-BY-4.0",
citation: "Saraga Team (2023)",
paper_url: "https://arxiv.org/abs/2309.16396",
year: 2023,
format: "JSONL",
notes: "Indian classical music; Carnatic/Hindustani metadata extraction",
categories: [ner, multilingual, arcane_domain],
},
MusicBrainzRE {
name: "MusicBrainz RE",
description: "Music metadata relations from Freebase/MusicBrainz. 116M instances.",
url: "https://web.stanford.edu/~jurafsky/mintz.pdf",
entity_types: ["Artist", "Album", "Track", "Label", "Genre"],
language: "en",
domain: "music",
license: "CC0",
citation: "Mintz et al. (2009)",
paper_url: "https://web.stanford.edu/~jurafsky/mintz.pdf",
year: 2009,
format: "Custom",
size_hint: "116 million instances, 7,300 binary relations",
notes: "Distant supervision from Freebase; music metadata relations",
categories: [relation_extraction, arcane_domain],
},
DINAA {
name: "DINAA",
description: "Digital Index of North American Archaeology. Geospatial heritage data.",
url: "https://ux.opencontext.org/endangered-data-and-the-digital-index-of-north-american-archaeology-dinaa/",
entity_types: ["Site", "Artifact", "Culture", "Period", "Location"],
language: "en",
domain: "archaeology",
license: "CC-BY-4.0",
citation: "DINAA Team",
year: 2015,
format: "Custom",
notes: "North American archaeological sites; geospatial heritage preservation",
categories: [ner, arcane_domain],
},
IMDbSemiStructuredRE {
name: "IMDb Semi-Structured RE",
description: "Distantly supervised extraction from structured web content.",
url: "https://www.vldb.org/pvldb/vol11/p1084-lockard.pdf",
entity_types: ["Movie", "Person", "Role", "Date", "Award"],
language: "en",
domain: "entertainment",
license: "Research",
citation: "Lockard et al. (2018)",
paper_url: "https://www.vldb.org/pvldb/vol11/p1084-lockard.pdf",
year: 2018,
format: "JSONL",
notes: "Web table extraction; semi-structured movie database relations",
categories: [relation_extraction, arcane_domain],
},
ATISFlightBooking {
name: "ATIS Flight Booking",
description: "Slot-filling NER for flight booking intents. Classic NLU benchmark.",
url: "https://github.com/yvchen/JointSLU",
entity_types: ["FromCity", "ToCity", "DepartDate", "ReturnDate", "Airline", "FlightNumber"],
language: "en",
domain: "travel",
license: "Research",
citation: "Hemphill et al. (1990)",
year: 1990,
format: "BIO",
notes: "Classic slot-filling benchmark; spoken language understanding",
categories: [ner],
},
PaleontologyNER {
name: "Paleontology NER",
description: "Dinosaurs, mammals, and river ecosystems entity retrieval.",
url: "https://aclanthology.org/anthology-files/anthology-files/pdf/findings/2023.findings-emnlp.218v1.pdf",
entity_types: ["Taxon", "Location", "TimePeriod", "Formation", "Specimen"],
language: "en",
domain: "paleontology",
license: "Research",
citation: "Paleo NER Team (2023)",
paper_url: "https://aclanthology.org/2023.findings-emnlp.218/",
year: 2023,
format: "CoNLL",
notes: "Paleontological literature; fossil taxa and geological formations",
categories: [ner, arcane_domain],
},
WaterResourceNER {
name: "Water Resource NER",
description: "Domain-adaptive NER for AI-driven water resource management.",
url: "https://www.frontiersin.org/journals/environmental-science/articles/10.3389/fenvs.2025.1558317/pdf",
entity_types: ["WaterBody", "Infrastructure", "Pollutant", "Measurement", "Policy"],
language: "en",
domain: "environment",
license: "CC-BY-4.0",
citation: "Water NER Team (2025)",
paper_url: "https://www.frontiersin.org/articles/10.3389/fenvs.2025.1558317/",
year: 2025,
format: "BIO",
notes: "Water management domain; infrastructure and policy entities",
categories: [ner, arcane_domain],
},
MalwareTextDB {
name: "MalwareTextDB",
description: "Annotated malware articles for cybersecurity NER.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Malware", "Vulnerability", "Tool", "ThreatActor", "IOC"],
language: "en",
domain: "cybersecurity",
license: "Research",
citation: "MalwareTextDB Team",
year: 2017,
format: "BRAT",
notes: "Security bulletin extraction; malware family identification",
categories: [ner, arcane_domain],
},
SECFilingsNER {
name: "SEC-filings",
description: "Finance domain NER from SEC filing documents.",
url: "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/test/FIN3.txt",
entity_types: ["Company", "Person", "Money", "Date", "Percentage"],
language: "en",
domain: "finance",
license: "CC-BY-3.0",
citation: "SEC-filings Team",
year: 2018,
format: "CoNLL",
notes: "Financial documents; SEC 10-K and 10-Q filings",
categories: [ner],
},
AnEM {
name: "AnEM",
description: "Anatomical entity mentions corpus. Anatomy terms in biomedical text.",
url: "http://www.nactem.ac.uk/anatomy/",
entity_types: ["AnatomicalStructure", "Organ", "Tissue", "Cell", "OrganismSubdivision"],
language: "en",
domain: "biomedical",
license: "CC-BY-SA-3.0",
citation: "Ohta et al. (2012)",
year: 2012,
format: "Standoff",
notes: "Anatomical entity corpus; fine-grained anatomy typing",
categories: [ner, biomedical],
},
RecipeDBAnnotated {
name: "RecipeDB Annotated",
description: "88k ingredient phrases via clustering-based sampling with Stanford NER.",
url: "https://aclanthology.org/2024.lrec-main.406/",
entity_types: ["Ingredient", "Quantity", "Unit", "Preparation"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "RecipeDB Team (2024)",
paper_url: "https://aclanthology.org/2024.lrec-main.406/",
year: 2024,
format: "JSONL",
size_hint: "88,526 ingredient phrases",
notes: "Clustering-based annotation; Stanford NER pipeline",
categories: [ner, arcane_domain],
},
RitterTwitterNER {
name: "Ritter Twitter NER",
description: "Twitter NER dataset with diverse entity types from tweets.",
url: "https://github.com/aritter/twitter_nlp",
entity_types: ["PER", "LOC", "ORG", "PRODUCT", "FACILITY", "BAND", "SPORTSTEAM"],
language: "en",
domain: "social_media",
license: "Research",
citation: "Ritter et al. (2011)",
paper_url: "https://aclanthology.org/D11-1141/",
year: 2011,
format: "CoNLL",
notes: "Early Twitter NER; original annotation has 10 entity types including bands and sports teams (movie, tvshow, and other are not listed here)",
categories: [ner, social_media],
},
MusicNER {
name: "Music-NER",
description: "Music domain entities. Artists, albums, songs, genres.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Artist", "Album", "Song", "Genre", "Instrument", "Label"],
language: "en",
domain: "music",
license: "MIT",
citation: "Music-NER Team",
year: 2020,
format: "CoNLL",
notes: "Music domain NER; includes record labels and instrument types",
categories: [ner, arcane_domain],
},
TutoringSessionsAlgebra {
name: "500 Tutoring Sessions",
description: "32k utterances from elementary algebra/physics tutoring. Mode identification.",
url: "https://aclanthology.org/C16-1188.pdf",
entity_types: ["Student", "Tutor", "Concept", "Problem"],
language: "en",
domain: "education",
license: "Research",
citation: "Boyer et al. (2016)",
paper_url: "https://aclanthology.org/C16-1188/",
year: 2016,
format: "Custom",
size_hint: "500 sessions, 32,368 utterances",
notes: "Tutoring mode identification; algebra and physics domains",
categories: [ner, dialogue, arcane_domain],
},
GNERGeoscience {
name: "GNER",
description: "Chinese geological entities from geoscience survey reports.",
url: "https://agupubs.onlinelibrary.wiley.com/doi/abs/10.1029/2019EA000610",
entity_types: ["Rock", "Mineral", "Stratum", "Age", "Location"],
language: "zh",
domain: "geology",
license: "Research",
citation: "GNER Team (2019)",
paper_url: "https://doi.org/10.1029/2019EA000610",
year: 2019,
format: "BIO",
notes: "Chinese geoscience reports; geological survey terminology",
categories: [ner, multilingual, arcane_domain],
},
FourRegionsGeologyNER {
name: "Four Regions Geology NER",
description: "Regional geological surveys with 6 typical geological categories.",
url: "https://www.geodoi.ac.cn/WebEn/down.aspx?ID=1873",
entity_types: ["Rock", "Mineral", "Stratum", "Structure", "Age", "Location"],
language: "zh",
domain: "geology",
license: "Research",
citation: "Four Regions Team",
year: 2020,
format: "BIO",
notes: "Regional Chinese geological surveys; multiple survey regions",
categories: [ner, multilingual, arcane_domain],
},
MSPPodcast {
name: "MSP-Podcast",
description: "100k+ English podcast episodes with multimodal annotations.",
url: "https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html",
entity_types: ["Speaker", "Topic", "Emotion", "Sentiment"],
language: "en",
domain: "speech",
license: "Research",
citation: "Lotfian & Busso (2019)",
year: 2019,
format: "Custom",
size_hint: "100,000+ podcast episodes",
notes: "Multimodal podcast annotations; emotion and sentiment",
categories: [ner, speech, arcane_domain],
},
SpotifyPodcastsDataset {
name: "Spotify Podcasts Dataset",
description: "Professional and amateur podcast episodes with transcriptions.",
url: "https://www.isca-archive.org/interspeech_2023/kotey23_interspeech.pdf",
entity_types: ["Host", "Guest", "Topic", "Advertisement"],
language: "en",
domain: "speech",
license: "Research",
citation: "Spotify Research (2023)",
paper_url: "https://www.isca-archive.org/interspeech_2023/kotey23_interspeech.html",
year: 2023,
format: "JSONL",
notes: "Professional and amateur podcasts; varied audio quality",
categories: [ner, speech, arcane_domain],
},
SportsNERGeneral {
name: "Sports NER",
description: "Player names, team names, event specifics from sports texts.",
url: "https://arxiv.org/html/2406.12252v1",
entity_types: ["Player", "Team", "Event", "Venue", "Score", "Date"],
language: "en",
domain: "sports",
license: "Research",
citation: "Sports NER Team (2024)",
paper_url: "https://arxiv.org/abs/2406.12252",
year: 2024,
format: "CoNLL",
notes: "General sports domain; player and team tracking",
categories: [ner, arcane_domain],
},
EsportsNER {
name: "Esports NER",
description: "Esports entity recognition. Pro players, teams, tournaments, games.",
url: "https://arxiv.org/html/2406.12252v1",
entity_types: ["Player", "Team", "Tournament", "Game", "Champion", "Map"],
language: "en",
domain: "gaming",
license: "Research",
citation: "Esports NER Team (2024)",
year: 2024,
format: "CoNLL",
notes: "Competitive gaming; League of Legends, CS:GO, Dota 2 terminology",
categories: [ner, arcane_domain],
},
DeepFashion2 {
name: "DeepFashion2",
description: "Comprehensive fashion dataset. 491k images, 801k clothing items.",
url: "https://github.com/switchablenorms/DeepFashion2",
entity_types: ["Category", "Style", "Color", "Pattern", "Landmark"],
language: "en",
domain: "fashion",
license: "Research",
citation: "Ge et al. (2019)",
paper_url: "https://arxiv.org/abs/1901.07973",
year: 2019,
format: "JSONL",
size_hint: "491k images, 801k clothing items, 13 categories",
notes: "Dense landmarks; cross-domain pose variation",
categories: [ner, arcane_domain],
},
ConstructionNER {
name: "Construction NER",
description: "Construction industry entities. Materials, equipment, processes.",
url: "https://www.sciencedirect.com/science/article/pii/S0926580520309481",
entity_types: ["Material", "Equipment", "Process", "Measurement", "Location"],
language: "en",
domain: "construction",
license: "Research",
citation: "Construction NER Team (2021)",
year: 2021,
format: "BIO",
notes: "Construction domain; building materials and heavy equipment",
categories: [ner, arcane_domain],
},
PharmaNER {
name: "PharmaNER",
description: "Pharmaceutical named entity recognition. Drug names, dosages, routes.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Drug", "Dosage", "Route", "Frequency", "Indication"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "PharmaNER Team",
year: 2019,
format: "BIO",
notes: "Pharmaceutical domain; prescription and OTC drug extraction",
categories: [ner, biomedical, clinical],
},
ProductReviewNER {
name: "Product Review NER",
description: "E-commerce product reviews with aspect and sentiment entities.",
url: "https://aclanthology.org/S14-2004/",
entity_types: ["Aspect", "Opinion", "Product", "Feature", "Sentiment"],
language: "en",
domain: "ecommerce",
license: "CC-BY-4.0",
citation: "SemEval 2014",
paper_url: "https://aclanthology.org/S14-2004/",
year: 2014,
format: "XML",
notes: "Aspect-based sentiment; product feature extraction",
categories: [ner],
},
RealEstateNER {
name: "Real Estate NER",
description: "Property listings entity extraction. Addresses, prices, features.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Address", "Price", "Size", "Rooms", "Amenity", "PropertyType"],
language: "en",
domain: "real_estate",
license: "Research",
citation: "Real Estate NER Team",
year: 2020,
format: "CoNLL",
notes: "Property listing domain; residential and commercial",
categories: [ner, arcane_domain],
},
AutomotiveNER {
name: "Automotive NER",
description: "Vehicle and automotive entities. Makes, models, parts, specs.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Make", "Model", "Part", "Specification", "Year", "Price"],
language: "en",
domain: "automotive",
license: "Research",
citation: "Automotive NER Team",
year: 2021,
format: "CoNLL",
notes: "Automotive domain; vehicle specifications and parts",
categories: [ner, arcane_domain],
},
TourismNER {
name: "Tourism NER",
description: "Tourism and travel entities. Attractions, hotels, restaurants.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Attraction", "Hotel", "Restaurant", "City", "Activity", "Price"],
language: "en",
domain: "tourism",
license: "CC-BY-4.0",
citation: "Tourism NER Team",
year: 2019,
format: "CoNLL",
notes: "Travel domain; tourist attractions and accommodations",
categories: [ner, arcane_domain],
},
EnergyNER {
name: "Energy NER",
description: "Energy sector entities. Power plants, fuels, grid infrastructure.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["PowerPlant", "Fuel", "Grid", "Capacity", "Company", "Location"],
language: "en",
domain: "energy",
license: "Research",
citation: "Energy NER Team",
year: 2020,
format: "BIO",
notes: "Energy sector; renewable and fossil fuel infrastructure",
categories: [ner, arcane_domain],
},
InsuranceNER {
name: "Insurance NER",
description: "Insurance domain entities. Policies, claims, coverages.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Policy", "Claim", "Coverage", "Premium", "Deductible", "Beneficiary"],
language: "en",
domain: "insurance",
license: "Research",
citation: "Insurance NER Team",
year: 2021,
format: "JSONL",
notes: "Insurance domain; policy and claims extraction",
categories: [ner, arcane_domain],
},
LogisticsNER {
name: "Logistics NER",
description: "Supply chain and logistics entities. Shipments, warehouses, routes.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Shipment", "Warehouse", "Route", "Carrier", "TrackingNumber", "Date"],
language: "en",
domain: "logistics",
license: "Research",
citation: "Logistics NER Team",
year: 2020,
format: "CoNLL",
notes: "Supply chain domain; shipping and warehousing",
categories: [ner, arcane_domain],
},
ResumeNER {
name: "Resume NER",
description: "Resume/CV entity extraction. Skills, experience, education.",
url: "https://www.kaggle.com/datasets/dataturks/resume-entities-for-ner",
entity_types: ["Skill", "Company", "Degree", "University", "Date", "Location"],
language: "en",
domain: "hr",
license: "CC0",
citation: "DataTurks",
year: 2018,
format: "JSONL",
notes: "Resume parsing; skill and experience extraction",
categories: [ner],
},
JobPostingNER {
name: "Job Posting NER",
description: "Job posting entity extraction. Requirements, benefits, qualifications.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["JobTitle", "Skill", "Salary", "Location", "Company", "Benefit"],
language: "en",
domain: "hr",
license: "Research",
citation: "Job Posting NER Team",
year: 2020,
format: "CoNLL",
notes: "Job listing domain; requirement and qualification extraction",
categories: [ner, arcane_domain],
},
HealthcareAdminNER {
name: "Healthcare Admin NER",
description: "Healthcare administration entities. Procedures, billing codes, facilities.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Procedure", "BillingCode", "Facility", "Provider", "Insurance"],
language: "en",
domain: "healthcare",
license: "Research",
citation: "Healthcare Admin Team",
year: 2021,
format: "BIO",
notes: "Healthcare administration; billing and coding",
categories: [ner, clinical, arcane_domain],
},
TelecomNER {
name: "Telecom NER",
description: "Telecommunications entities. Networks, devices, protocols.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Network", "Device", "Protocol", "Carrier", "Plan", "Speed"],
language: "en",
domain: "telecom",
license: "Research",
citation: "Telecom NER Team",
year: 2020,
format: "CoNLL",
notes: "Telecommunications domain; network and service extraction",
categories: [ner, arcane_domain],
},
WeatherNER {
name: "Weather NER",
description: "Weather and climate entities. Events, measurements, locations.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["WeatherEvent", "Temperature", "Precipitation", "Location", "Date", "Wind"],
language: "en",
domain: "weather",
license: "CC-BY-4.0",
citation: "Weather NER Team",
year: 2021,
format: "BIO",
notes: "Meteorological domain; weather event extraction",
categories: [ner, arcane_domain],
},
ManufacturingNER {
name: "Manufacturing NER",
description: "Manufacturing entities. Parts, processes, machines, defects.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Part", "Process", "Machine", "Defect", "Material", "Measurement"],
language: "en",
domain: "manufacturing",
license: "Research",
citation: "Manufacturing NER Team",
year: 2021,
format: "BIO",
notes: "Industrial manufacturing; quality control and process",
categories: [ner, arcane_domain],
},
RetailInventoryNER {
name: "Retail Inventory NER",
description: "Retail inventory entities. SKUs, quantities, locations, prices.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["SKU", "Quantity", "Location", "Price", "Category", "Supplier"],
language: "en",
domain: "retail",
license: "Research",
citation: "Retail NER Team",
year: 2020,
format: "JSONL",
notes: "Inventory management; stock and supplier tracking",
categories: [ner, arcane_domain],
},
CropDiseaseNER {
name: "Crop Disease NER",
description: "Crop disease identification. Symptoms, pathogens, treatments.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Disease", "Symptom", "Pathogen", "Treatment", "Crop", "Stage"],
language: "en",
domain: "agriculture",
license: "CC-BY-4.0",
citation: "Crop Disease Team",
year: 2022,
format: "BIO",
notes: "Plant pathology; disease symptom and treatment extraction",
categories: [ner, arcane_domain],
},
WineNER {
name: "Wine NER",
description: "Wine domain entities. Varietals, regions, vintages, tasting notes.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Varietal", "Region", "Vintage", "Producer", "TastingNote", "Price"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "Wine NER Team",
year: 2019,
format: "CoNLL",
notes: "Wine domain; sommelier terminology and tasting vocabulary",
categories: [ner, arcane_domain],
},
VeterinaryNER {
name: "Veterinary NER",
description: "Veterinary medicine entities. Animals, conditions, treatments.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Animal", "Breed", "Condition", "Treatment", "Medication", "Symptom"],
language: "en",
domain: "veterinary",
license: "Research",
citation: "Veterinary NER Team",
year: 2021,
format: "BIO",
notes: "Veterinary medicine; pet health and treatment",
categories: [ner, arcane_domain],
},
PhotographyNER {
name: "Photography NER",
description: "Photography entities. Cameras, lenses, settings, techniques.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Camera", "Lens", "Aperture", "ShutterSpeed", "ISO", "Technique"],
language: "en",
domain: "photography",
license: "CC-BY-4.0",
citation: "Photography NER Team",
year: 2020,
format: "CoNLL",
notes: "Photography domain; camera gear and technique extraction",
categories: [ner, arcane_domain],
},
GenealogyNER {
name: "Genealogy NER",
description: "Genealogical records entities. Names, relationships, dates, locations.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Person", "Relationship", "BirthDate", "DeathDate", "Location", "Occupation"],
language: "en",
domain: "genealogy",
license: "CC-BY-4.0",
citation: "Genealogy NER Team",
year: 2021,
format: "Custom",
notes: "Historical records; family history extraction",
categories: [ner, historical, arcane_domain],
},
BoardGameNER {
name: "Board Game NER",
description: "Board game entities. Games, mechanics, components, designers.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Game", "Mechanic", "Component", "Designer", "Publisher", "PlayerCount"],
language: "en",
domain: "gaming",
license: "CC-BY-4.0",
citation: "BoardGameGeek",
year: 2022,
format: "JSONL",
notes: "Board game domain; BGG taxonomy and mechanics",
categories: [ner, arcane_domain],
},
GardeningNER {
name: "Gardening NER",
description: "Gardening entities. Plants, soil, seasons, techniques.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Plant", "Soil", "Season", "Technique", "Tool", "Pest"],
language: "en",
domain: "gardening",
license: "CC-BY-4.0",
citation: "Gardening NER Team",
year: 2021,
format: "CoNLL",
notes: "Horticulture domain; plant care and cultivation",
categories: [ner, arcane_domain],
},
BrewingNER {
name: "Brewing NER",
description: "Craft brewing entities. Ingredients, processes, styles, equipment.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Ingredient", "Process", "Style", "Equipment", "ABV", "IBU"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "Brewing NER Team",
year: 2020,
format: "CoNLL",
notes: "Craft beer domain; brewing process and style vocabulary",
categories: [ner, arcane_domain],
},
KnittingNER {
name: "Knitting NER",
description: "Knitting and crafts entities. Patterns, yarns, stitches, tools.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Pattern", "Yarn", "Stitch", "Tool", "Size", "Technique"],
language: "en",
domain: "crafts",
license: "CC-BY-4.0",
citation: "Ravelry",
year: 2021,
format: "JSONL",
notes: "Fiber arts domain; knitting pattern terminology",
categories: [ner, arcane_domain],
},
FitnessNER {
name: "Fitness NER",
description: "Fitness entities. Exercises, muscles, equipment, routines.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Exercise", "Muscle", "Equipment", "Sets", "Reps", "Duration"],
language: "en",
domain: "fitness",
license: "CC-BY-4.0",
citation: "Fitness NER Team",
year: 2021,
format: "CoNLL",
notes: "Exercise domain; workout routine extraction",
categories: [ner, arcane_domain],
},
AstrologyNER {
name: "Astrology NER",
description: "Astrological entities. Signs, planets, houses, aspects.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Sign", "Planet", "House", "Aspect", "Transit", "Date"],
language: "en",
domain: "astrology",
license: "CC-BY-4.0",
citation: "Astrology NER Team",
year: 2021,
format: "CoNLL",
notes: "Astrological terminology; horoscope interpretation",
categories: [ner, arcane_domain],
},
TattooNER {
name: "Tattoo NER",
description: "Tattoo entities. Styles, placements, artists, designs.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Style", "Placement", "Artist", "Design", "Color", "Size"],
language: "en",
domain: "art",
license: "CC-BY-4.0",
citation: "Tattoo NER Team",
year: 2022,
format: "JSONL",
notes: "Body art domain; tattoo style and placement vocabulary",
categories: [ner, arcane_domain],
},
FragranceNER {
name: "Fragrance NER",
description: "Perfume entities. Notes, accords, houses, concentrations.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Note", "Accord", "House", "Concentration", "Season", "Longevity"],
language: "en",
domain: "fragrance",
license: "CC-BY-4.0",
citation: "Fragrantica",
year: 2021,
format: "JSONL",
notes: "Perfumery domain; scent pyramid and accord terminology",
categories: [ner, arcane_domain],
},
ChessNER {
name: "Chess NER",
description: "Chess entities. Openings, players, tournaments, moves.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Opening", "Player", "Tournament", "Move", "ELO", "TimeControl"],
language: "en",
domain: "gaming",
license: "CC-BY-4.0",
citation: "Lichess/Chess.com",
year: 2022,
format: "JSONL",
notes: "Chess domain; opening theory and tournament extraction",
categories: [ner, arcane_domain],
},
CocktailNER {
name: "Cocktail NER",
description: "Cocktail entities. Ingredients, techniques, glassware, garnishes.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Spirit", "Mixer", "Technique", "Glassware", "Garnish", "Measurement"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "Cocktail NER Team",
year: 2020,
format: "CoNLL",
notes: "Mixology domain; bartending vocabulary and techniques",
categories: [ner, arcane_domain],
},
AntiquesNER {
name: "Antiques NER",
description: "Antiques entities. Periods, styles, materials, makers.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Period", "Style", "Material", "Maker", "Provenance", "Condition"],
language: "en",
domain: "antiques",
license: "CC-BY-4.0",
citation: "Antiques NER Team",
year: 2021,
format: "JSONL",
notes: "Antiques domain; period furniture and collectibles",
categories: [ner, historical, arcane_domain],
},
MaritimeNER {
name: "Maritime NER",
description: "Maritime entities. Vessels, ports, routes, cargo.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Vessel", "Port", "Route", "Cargo", "Flag", "IMONumber"],
language: "en",
domain: "maritime",
license: "Research",
citation: "Maritime NER Team",
year: 2021,
format: "CoNLL",
notes: "Shipping domain; vessel tracking and maritime logistics",
categories: [ner, arcane_domain],
},
EquestrianNER {
name: "Equestrian NER",
description: "Equestrian entities. Horses, breeds, disciplines, tack.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Horse", "Breed", "Discipline", "Tack", "Rider", "Competition"],
language: "en",
domain: "equestrian",
license: "CC-BY-4.0",
citation: "Equestrian NER Team",
year: 2021,
format: "CoNLL",
notes: "Horse sports domain; dressage and jumping terminology",
categories: [ner, arcane_domain],
},
WoodworkingNER {
name: "Woodworking NER",
description: "Woodworking entities. Tools, joints, wood types, finishes.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Tool", "Joint", "WoodType", "Finish", "Technique", "Measurement"],
language: "en",
domain: "crafts",
license: "CC-BY-4.0",
citation: "Woodworking NER Team",
year: 2021,
format: "CoNLL",
notes: "Carpentry domain; joinery and finishing vocabulary",
categories: [ner, arcane_domain],
},
BirdwatchingNER {
name: "Birdwatching NER",
description: "Birdwatching entities. Species, habitats, behaviors, locations.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Species", "Family", "Habitat", "Behavior", "Location", "Season"],
language: "en",
domain: "wildlife",
license: "CC-BY-4.0",
citation: "eBird/Cornell Lab",
year: 2022,
format: "JSONL",
notes: "Ornithology domain; bird identification and behavior",
categories: [ner, arcane_domain],
},
NumismaticsNER {
name: "Numismatics NER",
description: "Coin collecting entities. Denominations, mints, grades, errors.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Denomination", "Mint", "Grade", "Error", "Year", "Metal"],
language: "en",
domain: "numismatics",
license: "CC-BY-4.0",
citation: "PCGS/NGC",
year: 2021,
format: "JSONL",
notes: "Coin collecting; grading and mint terminology",
categories: [ner, arcane_domain],
},
PhilatelyNER {
name: "Philately NER",
description: "Stamp collecting entities. Issues, perforations, watermarks, varieties.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Issue", "Perforation", "Watermark", "Variety", "Country", "Year"],
language: "en",
domain: "philately",
license: "CC-BY-4.0",
citation: "Scott Catalogue",
year: 2021,
format: "JSONL",
notes: "Stamp collecting; philatelic terminology",
categories: [ner, arcane_domain],
},
ScubaNER {
name: "Scuba NER",
description: "Scuba diving entities. Equipment, sites, certifications, marine life.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Equipment", "DiveSite", "Certification", "MarineLife", "Depth", "Visibility"],
language: "en",
domain: "scuba",
license: "CC-BY-4.0",
citation: "PADI/SSI",
year: 2021,
format: "CoNLL",
notes: "Recreational diving; dive site and equipment extraction",
categories: [ner, arcane_domain],
},
ThemeParkNER {
name: "Theme Park NER",
description: "Theme park entities. Rides, parks, manufacturers, statistics.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Ride", "Park", "Manufacturer", "Height", "Speed", "Type"],
language: "en",
domain: "entertainment",
license: "CC-BY-4.0",
citation: "RCDB",
year: 2022,
format: "JSONL",
notes: "Amusement park domain; roller coaster specifications",
categories: [ner, arcane_domain],
},
OrigamiNER {
name: "Origami NER",
description: "Origami entities. Folds, bases, models, paper types.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Fold", "Base", "Model", "PaperType", "Designer", "Difficulty"],
language: "en",
domain: "crafts",
license: "CC-BY-4.0",
citation: "Origami NER Team",
year: 2021,
format: "CoNLL",
notes: "Paper folding domain; fold terminology and model names",
categories: [ner, arcane_domain],
},
AnimeMangaNER {
name: "Anime/Manga NER",
description: "Anime and manga entities. Titles, characters, studios, genres.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Title", "Character", "Studio", "Genre", "Author", "Year"],
language: "mul",
domain: "entertainment",
license: "CC-BY-4.0",
citation: "MyAnimeList/AniDB",
year: 2022,
format: "JSONL",
notes: "Japanese animation and comics; includes romanized and Japanese-script names",
categories: [ner, multilingual, arcane_domain],
},
CryptoNER {
name: "Crypto NER",
description: "Cryptocurrency entities. Tokens, wallets, exchanges, protocols.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Token", "Wallet", "Exchange", "Protocol", "Price", "Address"],
language: "en",
domain: "crypto",
license: "Research",
citation: "Crypto NER Team",
year: 2022,
format: "CoNLL",
notes: "Blockchain domain; DeFi and NFT terminology",
categories: [ner, arcane_domain],
},
TelenovelaNER {
name: "Telenovela NER",
description: "Spanish-language soap opera entities. Characters, relationships, plots.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Character", "Relationship", "PlotPoint", "Actor", "Network"],
language: "es",
domain: "entertainment",
license: "CC-BY-4.0",
citation: "Telenovela NER Team",
year: 2021,
format: "CoNLL",
notes: "Spanish-language soap operas; melodrama terminology",
categories: [ner, multilingual, arcane_domain],
},
TarotNER {
name: "Tarot NER",
description: "Tarot entities. Cards, spreads, meanings, suits.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Card", "Spread", "Meaning", "Suit", "Position", "Reversal"],
language: "en",
domain: "divination",
license: "CC-BY-4.0",
citation: "Tarot NER Team",
year: 2021,
format: "CoNLL",
notes: "Tarot reading; card interpretation vocabulary",
categories: [ner, arcane_domain],
},
BeekeepingNER {
name: "Beekeeping NER",
description: "Apiculture entities. Equipment, bee types, diseases, products.",
url: "https://github.com/juand-r/entity-recognition-datasets",
entity_types: ["Equipment", "BeeType", "Disease", "Product", "Season", "Technique"],
language: "en",
domain: "agriculture",
license: "CC-BY-4.0",
citation: "Beekeeping NER Team",
year: 2021,
format: "CoNLL",
notes: "Apiculture domain; hive management vocabulary",
categories: [ner, arcane_domain],
},
}