// DocRED: document-level relation extraction over English Wikipedia text.
// NOTE(review): `url` points at a file in the CrossRE repository (ai-test.json),
// not at a DocRED download, and it is a .json file while `format` says JSONL.
// Confirm whether this is an intentional stand-in before relying on `expected_docs`.
DocRED {
name: "DocRED",
description: "Document-level relation extraction. 96 relation types from Wikipedia.",
url: "https://raw.githubusercontent.com/mainlp/CrossRE/main/crossre_data/ai-test.json",
entity_types: ["PER", "LOC", "ORG", "TIME", "NUM"],
language: "en",
domain: "wikipedia",
license: "MIT",
citation: "Yao et al. (2019)",
paper_url: "https://aclanthology.org/P19-1074/",
year: 2019,
format: "JSONL",
size_hint: "5,053 docs, 132k entities, 56k relations",
splits: ["train", "dev", "test"],
tasks: ["re"],
expected_docs: 5053,
hf_id: "docred",
categories: [relation_extraction],
},
// Re-TACRED: relabeled version of TACRED for relation extraction.
// `url` is empty and access is DependsOnOther because the original
// (LDC-licensed) TACRED release is required, per `notes`.
ReTACRED {
name: "Re-TACRED",
description: "Large-scale relation extraction. 41 relation types + no_relation. Cleaned TACRED.",
url: "",
entity_types: ["PER", "ORG", "LOC", "DATE", "NUM"],
language: "en",
domain: "news",
license: "LDC",
citation: "Stoica et al. (2021)",
paper_url: "https://aclanthology.org/2021.acl-long.359/",
year: 2021,
format: "JSONL",
size_hint: "~106k relations",
notes: "Cleaned version of TACRED with relabeling; requires original TACRED",
access_status: DependsOnOther,
categories: [relation_extraction],
},
// ACE 2004: LDC-distributed nested-NER corpus (XML, standoff annotation).
// No direct download, hence the empty `url` and Registration access status.
ACE2004 {
name: "ACE 2004",
description: "Nested entity recognition benchmark. Influential early corpus for nested NER research.",
url: "",
entity_types: ["PER", "ORG", "GPE", "LOC", "FAC", "WEA", "VEH"],
language: "en",
domain: "news",
license: "LDC",
citation: "Doddington et al. (2004)",
paper_url: "https://aclanthology.org/L04-1011/",
year: 2004,
format: "XML",
annotation_scheme: "Standoff",
notes: "Requires LDC license; includes entity relations; includes some nested entities",
access_status: Registration,
categories: [ner, nested_ner],
},
// CADEC: adverse-drug-event NER on patient-written forum posts (BRAT standoff).
// Spans may be discontinuous, so loaders need special handling (see `notes`).
CADEC {
name: "CADEC",
description: "Clinical Adverse Drug Events. Benchmark for discontinuous NER from AskaPatient.",
url: "https://huggingface.co/datasets/KevinSpaghetti/cadec",
entity_types: ["ADR", "Drug", "Disease", "Symptom"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Karimi et al. (2015)",
paper_url: "https://pubmed.ncbi.nlm.nih.gov/25817970/",
year: 2015,
format: "BRAT",
annotation_scheme: "Standoff",
size_hint: "~1,250 posts",
example: "'severe [pain]...in my [legs]' -> ADR spans [0:10, 20:24] (discontinuous)",
notes: "Discontinuous spans common; requires special handling; patient-written text",
splits: ["train", "dev", "test"],
tasks: ["ner", "discontinuous_ner"],
hf_id: "KevinSpaghetti/cadec",
categories: [ner, biomedical, discontinuous_ner],
},
// WinoQueer: anti-LGBTQ+ bias benchmark shipped as a single CSV file,
// exposed as one "all" split.
WinoQueer {
name: "WinoQueer",
description: "Anti-LGBTQ+ bias benchmark. Community-in-the-loop design for queer representation.",
url: "https://raw.githubusercontent.com/katyfelkner/winoqueer/main/data/winoqueer_final.csv",
entity_types: ["PER"],
language: "en",
domain: "evaluation",
license: "MIT",
citation: "Felkner et al. (2023)",
paper_url: "https://aclanthology.org/2023.acl-long.507/",
year: 2023,
format: "CSV",
size_hint: "45,540 sentence pairs, ~4.8MB",
notes: "Community-designed; tests queer stereotypes in LLMs; Winograd-schema style",
splits: ["all"],
tasks: ["bias_evaluation"],
categories: [bias_evaluation],
},
// BBQ: Bias Benchmark for QA (bias evaluation via question answering).
// `size_hint` spells out how the 11 category files relate to the 9 social
// dimensions named in `description`, so the two fields no longer disagree.
// NOTE(review): `url` fetches only the Gender_identity category file, not the
// whole benchmark — confirm that this is the intended download.
BBQ {
name: "BBQ",
description: "Bias Benchmark for QA. Tests 9 social bias categories including sexual orientation.",
url: "https://raw.githubusercontent.com/nyu-mll/BBQ/main/data/Gender_identity.jsonl",
entity_types: [],
language: "en",
domain: "evaluation",
license: "CC-BY-4.0",
citation: "Parrish et al. (2022)",
paper_url: "https://aclanthology.org/2022.findings-acl.165/",
year: 2022,
format: "JSONL",
size_hint: "~58k QA pairs across 11 categories (9 social dimensions + 2 intersectional)",
notes: "Hand-built ambiguous contexts; age, disability, nationality, religion, etc.",
splits: ["all"],
tasks: ["bias_evaluation", "qa"],
categories: [bias_evaluation],
},
// GICoref: gender-inclusive English coreference corpus in CoNLL format,
// exposed as a single "all" split.
GICoref {
name: "GICoref",
description: "Gender-inclusive coreference. Written by/about trans and non-binary individuals.",
url: "https://raw.githubusercontent.com/TristaCao/into_inclusivecoref/master/GICoref/coref.combo.conll",
entity_types: ["PER"],
language: "en",
domain: "evaluation",
license: "CC-BY-4.0",
citation: "Cao & Daume III (2020)",
paper_url: "https://aclanthology.org/2020.acl-main.418/",
year: 2020,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
size_hint: "95 docs, 470KB",
notes: "Includes neopronouns (ze/hir, xe/xem); singular they; first gender-inclusive coref corpus",
splits: ["all"],
tasks: ["coref"],
categories: [coref, bias_evaluation],
},
// CorefUD: multilingual coreference collection in CoNLL-U.
// `url` is a persistent handle rather than a direct file link; this entry
// declares no splits or tasks.
CorefUD {
name: "CorefUD",
description: "Multilingual coreference (17 languages, 22 datasets). CRAC shared task standard.",
url: "http://hdl.handle.net/11234/1-5987",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "general",
license: "CC-BY-NC-SA-4.0",
citation: "Nedoluzhko et al. (2022)",
paper_url: "https://aclanthology.org/2022.lrec-1.581/",
year: 2022,
format: "CoNLLU",
annotation_scheme: "CoNLLCoref",
notes: "CRAC shared task standard; includes zero anaphora; harmonized across treebanks",
categories: [coref, multilingual],
},
// TransMuCoRes: silver (translation-derived) coreference data for South Asian
// languages. No public download; the data must be requested from the authors.
TransMuCoRes {
name: "TransMuCoRes",
description: "Coreference in 31 South Asian languages. Silver annotations via NLLB-200 translation.",
url: "",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "general",
license: "Research",
citation: "Verma et al. (2024)",
paper_url: "https://arxiv.org/abs/2402.13571",
year: 2024,
format: "JSONL",
notes: "Silver annotations via translation; fine-tuned mBERT models available",
access_status: ContactAuthors,
categories: [coref, multilingual],
},
// mGAP: multilingual extension of GAP (gender-ambiguous pronouns) to South
// Asian languages. No public download; the data must be requested from the authors.
MGAP {
name: "mGAP",
description: "Multilingual Gender-Ambiguous Pronouns. 27 South Asian languages.",
url: "",
entity_types: ["PER"],
language: "mul",
domain: "evaluation",
license: "Research",
citation: "Verma et al. (2025)",
paper_url: "https://aclanthology.org/2025.chipsal-1.10/",
year: 2025,
format: "TSV",
size_hint: "8,908 pronoun-name pairs",
notes: "Cross-attention improves results; extension of GAP to South Asian languages",
access_status: ContactAuthors,
categories: [coref, multilingual, bias_evaluation],
},
// CrowS-Pairs: crowdsourced stereotype sentence pairs for probing bias in
// language models. `url` is the repository page, not a raw data file.
CrowSPairs {
name: "CrowS-Pairs",
description: "Crowdsourced stereotype pairs benchmark. 9 bias categories for language models.",
url: "https://github.com/nyu-mll/crows-pairs",
entity_types: [],
language: "en",
domain: "evaluation",
license: "CC-BY-SA-4.0",
citation: "Nangia et al. (2020)",
paper_url: "https://aclanthology.org/2020.emnlp-main.154/",
year: 2020,
format: "CSV",
size_hint: "~1.5k sentence pairs",
notes: "Tests stereotypical associations; gender, race, religion, age, nationality, etc.",
splits: ["all"],
tasks: ["bias_evaluation"],
categories: [bias_evaluation],
},
// StereoSet: stereotypical-bias benchmark for language models.
// `citation` and `year` use 2021 to match the ACL 2021 publication that
// `paper_url` links to. `url` is the repository page, not a raw data file.
StereoSet {
name: "StereoSet",
description: "Measuring stereotypical bias in language models. 4 target domains.",
url: "https://github.com/moinnadeem/StereoSet",
entity_types: [],
language: "en",
domain: "evaluation",
license: "MIT",
citation: "Nadeem et al. (2021)",
paper_url: "https://aclanthology.org/2021.acl-long.416/",
year: 2021,
format: "JSONL",
size_hint: "~17k instances",
notes: "Intrasentence and intersentence evaluation; gender, profession, race, religion",
splits: ["all"],
tasks: ["bias_evaluation"],
categories: [bias_evaluation],
},
// RealToxicityPrompts: prompt set for measuring toxic generation in language
// models, hosted on Hugging Face. Tagged as both bias evaluation and adversarial.
RealToxicityPrompts {
name: "RealToxicityPrompts",
description: "100k prompts for measuring toxicity generation in language models.",
url: "https://huggingface.co/datasets/allenai/real-toxicity-prompts",
entity_types: [],
language: "en",
domain: "evaluation",
license: "Apache-2.0",
citation: "Gehman et al. (2020)",
paper_url: "https://aclanthology.org/2020.findings-emnlp.301/",
year: 2020,
format: "JSONL",
size_hint: "~100k prompts",
notes: "Tests toxicity generation; perspectives API scores; diverse prompt styles",
splits: ["all"],
tasks: ["bias_evaluation"],
categories: [bias_evaluation, adversarial],
},
// BOLD: Wikipedia-based prompts for measuring bias in open-ended generation.
// The registry key is `BoldBias` while the display name is "BOLD".
// `url` is the repository page, not a raw data file.
BoldBias {
name: "BOLD",
description: "Bias in Open-ended Language Generation Dataset. Wikipedia-based prompts.",
url: "https://github.com/amazon-science/bold",
entity_types: [],
language: "en",
domain: "evaluation",
license: "CC-BY-4.0",
citation: "Dhamala et al. (2021)",
paper_url: "https://aclanthology.org/2021.findings-acl.311/",
year: 2021,
format: "JSONL",
size_hint: "~23k prompts",
notes: "Tests generation bias; profession, gender, race, religion, political ideology",
splits: ["all"],
tasks: ["bias_evaluation"],
categories: [bias_evaluation],
},
// DROC: German literary coreference corpus.
// Marked Deprecated with an empty `url` because the previous download link is
// dead (see `notes`); a new mirror is needed before this entry can be fetched.
DROC {
name: "DROC",
description: "German novel coreference. 90 German novels from DTA (Deutsches Textarchiv).",
url: "",
entity_types: ["PER"],
language: "de",
domain: "literature",
license: "CC-BY-4.0",
citation: "Krug et al. (2018)",
paper_url: "https://aclanthology.org/L18-1045/",
year: 2018,
format: "JSONL",
notes: "First public German literary coreference dataset. Prior URL is dead; needs a fresh mirror.",
splits: ["all"],
tasks: ["coref"],
access_status: Deprecated,
categories: [coref, literary],
},
// FantasyCoref: coreference over fairy tales, where entities can transform.
// `url` downloads the GitHub repository as a zip archive of the master branch.
FantasyCoref {
name: "FantasyCoref",
description: "Fantasy fiction coreference. Handles entity transformations. 211 Grimms' stories.",
url: "https://github.com/emorynlp/FantasyCoref/archive/refs/heads/master.zip",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "literature",
license: "Research",
citation: "Han et al. (2021)",
paper_url: "https://aclanthology.org/2021.emnlp-main.672/",
year: 2021,
format: "JSONL",
notes: "Shape-shifting, possession, disguise - unique challenges; data on GitHub",
tasks: ["coref"],
access_status: Public,
categories: [coref, literary],
},
// BOOKCOREF (full-book configuration): book-scale character coreference.
// Access is DependsOnOther because the data is fetched through the Hugging Face
// `datasets` library and must then be exported to JSONL for the loader (see `notes`).
BookCoref {
name: "BOOKCOREF",
description: "Book-scale coreference. First benchmark with 200k+ tokens/doc average. Character coreference on 53 Project Gutenberg novels.",
url: "https://huggingface.co/datasets/sapienzanlp/bookcoref",
entity_types: ["PER"],
language: "en",
domain: "literature",
license: "CC-BY-NC-SA-4.0",
citation: "Martinelli et al. (2025)",
paper_url: "https://aclanthology.org/2025.acl-long.1197/",
year: 2025,
format: "JSONL",
annotation_scheme: "CoNLLCoref",
size_hint: "53 books, ~10.8M tokens silver, 229k tokens gold",
example: "doc_key: pride_and_prejudice_1342\nsentences: [[CHAPTER, I.], [It, is, a, truth, ...]]\nclusters: [[[79,80], [81,82], ...], ...]\ncharacters: [{name: Mr Bennet, cluster: [[79,80]]}]",
notes: "Gold test set: 3 books (Animal Farm, Siddhartha, Pride & Prejudice). Silver train: 45 books. Unprecedented 73k avg mention distance. Requires HF datasets Python lib to download; export to JSONL for anno loader.",
splits: ["train", "validation", "test"],
tasks: ["coref"],
expected_docs: 53,
hf_id: "sapienzanlp/bookcoref",
access_status: DependsOnOther,
categories: [coref, literary, long_document],
},
// BOOKCOREF (Split): the same Hugging Face dataset as BookCoref, selected via
// `hf_config: "split"`, which yields 1500-token windows instead of whole books.
BookCorefSplit {
name: "BOOKCOREF (Split)",
description: "BOOKCOREF split into 1500-token windows for comparison with standard benchmarks.",
url: "https://huggingface.co/datasets/sapienzanlp/bookcoref",
entity_types: ["PER"],
language: "en",
domain: "literature",
license: "CC-BY-NC-SA-4.0",
citation: "Martinelli et al. (2025)",
paper_url: "https://aclanthology.org/2025.acl-long.1197/",
year: 2025,
format: "JSONL",
annotation_scheme: "CoNLLCoref",
size_hint: "7544 train, 398 val, 152 test windows",
notes: "Same data as BOOKCOREF but windowed. Enables fair comparison: Maverickxl gets 82.2 CoNLL F1 on split vs 61.0 on full books. Requires HF datasets Python lib to download.",
splits: ["train", "validation", "test"],
tasks: ["coref"],
hf_id: "sapienzanlp/bookcoref",
hf_config: "split",
access_status: DependsOnOther,
categories: [coref, literary],
},
// LongtoNotes: OntoNotes documents merged back into full-length documents.
// NOTE(review): `url` is a Google Forms request page and `notes` say OntoNotes
// access is required, yet no `access_status` is set — confirm the default fits.
LongtoNotes {
name: "LongtoNotes",
description: "OntoNotes with merged coreference chains. Manually merges split OntoNotes documents back into full documents.",
url: "https://docs.google.com/forms/d/e/1FAIpQLScoWkBOgJ1HH_phtvTJ4_hGvQw6f0W6K7kw74sUKCDTG8P2iA/viewform",
entity_types: ["PER", "ORG", "GPE", "NORP"],
language: "en",
domain: "mixed",
license: "CC-BY-4.0",
citation: "Shridhar et al. (2023)",
paper_url: "https://aclanthology.org/2023.findings-eacl.105/",
year: 2023,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
size_hint: "2,415 documents, up to 8x longer than OntoNotes",
notes: "Requires OntoNotes access. Documents up to 8x OntoNotes length, 2x LitBank. Multi-genre (WSJ, broadcast, web).",
splits: ["train", "dev", "test"],
tasks: ["coref"],
categories: [coref, long_document],
},
// MovieCoref: character coreference in movie screenplays.
// `url` is the gzipped supplementary-material attachment of the ACL Anthology paper.
MovieCoref {
name: "MovieCoref",
description: "Screenplay coreference. Character coreference in movie screenplays with unique structural challenges.",
url: "https://aclanthology.org/attachments/2021.findings-acl.176.OptionalSupplementaryMaterial.gz",
entity_types: ["PER"],
language: "en",
domain: "literature",
license: "Research",
citation: "Baruah et al. (2021)",
paper_url: "https://aclanthology.org/2021.findings-acl.176/",
year: 2021,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
size_hint: "9 screenplays (~22k tokens/doc avg), 6 full + 3 excerpts",
notes: "Screenplay structure (scene headings, character names, parentheticals) differs significantly from prose. Focus on character coreference only.",
splits: ["all"],
tasks: ["coref"],
categories: [coref, literary, long_document],
},
// TwiConv: coreference over Twitter conversations.
// NOTE(review): `url` names one specific file under conll_skeleton/ — presumably
// a single conversation thread rather than the whole corpus; confirm.
TwiConv {
name: "TwiConv",
description: "Twitter conversational coreference.",
url: "https://raw.githubusercontent.com/berfingit/TwiConv/main/conll_skeleton/001_940791133357199360.branch7._with_boundaries_gold_conll",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "social_media",
license: "Research",
citation: "Aktaş et al. (2020)",
paper_url: "https://aclanthology.org/2020.lrec-1.835/",
format: "CoNLL",
notes: "Turn-taking dynamics; speaker grounding",
categories: [coref, dialogue, social_media],
},
// MuDoCo: multi-domain dialogue coreference in JSON.
// NOTE(review): `url` fetches only mudoco_calling.json — presumably one of
// several per-domain files; confirm whether the other domains should be covered.
MuDoCo {
name: "MuDoCo",
description: "Multi-domain document-level coreference. Dialog-based.",
url: "https://raw.githubusercontent.com/facebookresearch/mudoco/main/mudoco_calling.json",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "dialogue",
license: "MIT",
citation: "Raghunathan et al. (2020)",
paper_url: "https://arxiv.org/abs/2005.00816",
format: "JSON",
notes: "Multi-domain dialog coreference with speaker tracking",
categories: [coref, dialogue],
},
// DialogRE: relation extraction over multi-turn dialogue transcripts.
// Declares both "re" and "coref" tasks; `url` is the repository page.
DialogRE {
name: "DialogRE",
description: "Dialogue-based relation extraction. Multi-turn conversations requiring entity tracking across turns.",
url: "https://github.com/nlpdata/dialogre",
entity_types: ["PER", "ORG", "LOC"],
language: "en",
domain: "dialogue",
license: "CC-BY-NC-SA-4.0",
citation: "Yu et al. (2020)",
paper_url: "https://aclanthology.org/2020.acl-main.444/",
year: 2020,
format: "JSONL",
size_hint: "~1.8k dialogues, 36 relation types",
notes: "Based on Friends TV show transcripts; requires tracking entities across dialogue turns",
splits: ["train", "dev", "test"],
tasks: ["re", "coref"],
categories: [relation_extraction, dialogue],
},
// MultiWOZ NER: task-oriented dialogue with slot/entity annotations.
// `entity_types` lists the seven dialogue domains; `url` is the repository page.
MultiWOZNER {
name: "MultiWOZ NER",
description: "Multi-domain task-oriented dialogue with slot/entity annotations. Multi-turn conversations.",
url: "https://github.com/budzianowski/multiwoz",
entity_types: ["RESTAURANT", "HOTEL", "ATTRACTION", "TAXI", "TRAIN", "HOSPITAL", "POLICE"],
language: "en",
domain: "dialogue",
license: "Apache-2.0",
citation: "Budzianowski et al. (2018)",
paper_url: "https://aclanthology.org/D18-1547/",
year: 2018,
format: "JSONL",
size_hint: "~10k dialogues, 7 domains",
notes: "Standard benchmark for dialogue state tracking; slot values correspond to entities",
splits: ["train", "dev", "test"],
tasks: ["ner", "slot_filling"],
categories: [ner, dialogue],
},
// CoQA: conversational question answering, registered here for its implicit
// entity tracking across turns (categories: coref, dialogue).
// `url` is the project landing page, not a raw data file.
CoQAEntities {
name: "CoQA",
description: "Conversational QA across 7 domains: children's stories, literature, mid/high school exams, news, Wikipedia, science, Reddit.",
url: "https://stanfordnlp.github.io/coqa/",
entity_types: ["ANSWER_SPAN"],
language: "en",
domain: "mixed",
license: "Research",
citation: "Reddy et al. (2019)",
paper_url: "https://aclanthology.org/Q19-1016/",
year: 2019,
format: "JSONL",
size_hint: "~8k conversations, ~127k QA turns",
notes: "Multi-turn QA; implicit entity tracking across conversation history; 7 diverse domains",
splits: ["train", "dev", "test"],
tasks: ["qa", "coref"],
categories: [coref, dialogue],
},
// Gun Violence Corpus: cross-document event coreference on gun-violence news.
// `size_hint` previously paired "~500 docs" with "510 mentions"; 510 is the
// document count, and the mention count is in the thousands.
// NOTE(review): figures follow the commonly reported GVC statistics
// (510 docs, 7,298 event mentions) — verify against the corpus release.
GVC {
name: "Gun Violence Corpus",
description: "Cross-document event coreference for gun violence. Tests domain transfer from ECB+.",
url: "https://github.com/cltl/GunViolenceCorpus",
entity_types: ["EVENT", "PARTICIPANT", "WEAPON", "LOCATION", "TIME"],
language: "en",
domain: "news",
license: "Research",
citation: "Vossen et al. (2018)",
paper_url: "https://aclanthology.org/L18-1182/",
year: 2018,
format: "Custom",
size_hint: "510 docs, ~7.3k event mentions",
notes: "Domain-specific CDEC; requires participant/temporal reasoning unlike lemma-driven ECB+",
splits: ["train", "dev", "test"],
tasks: ["coref", "event_coref", "cdcr"],
categories: [coref, event_coref],
},
// Football Coreference Corpus: cross-document event coreference on football
// match reports. No public download; the data must be requested from the authors.
FCC {
name: "Football Coreference Corpus",
description: "Cross-document event coreference for football matches. Requires temporal reasoning.",
url: "",
entity_types: ["EVENT", "PARTICIPANT", "LOC", "TIME"],
language: "en",
domain: "sports",
license: "Research",
citation: "Bugert et al. (2021)",
paper_url: "https://direct.mit.edu/coli/article/47/3/575/102774/",
notes: "Requires temporal reasoning about match dates",
access_status: ContactAuthors,
categories: [coref, event_coref],
},
// ECB+META: ECB+ with sentences rewritten as metaphoric paraphrases, used as
// an adversarial event-coreference set. Must be requested from the authors.
ECBPlusMeta {
name: "ECB+META",
description: "ECB+ with metaphoric paraphrases. ChatGPT-transformed sentences.",
url: "",
entity_types: ["EVENT", "TIME", "LOC", "PARTICIPANT"],
language: "en",
domain: "news",
license: "Research",
citation: "Pouran Ben Veyseh et al. (2024)",
paper_url: "https://arxiv.org/abs/2407.11988",
notes: "Adversarial; existing systems struggle badly",
access_status: ContactAuthors,
categories: [coref, event_coref, adversarial],
},
// ARRAU: multi-genre anaphora corpus (identity, bridging, discourse deixis) in
// MMAX2 format. Distributed under an LDC + research license, hence the empty
// `url` and Registration access status.
ARRAU {
name: "ARRAU 3.0 (v2)",
description: "Multi-genre anaphoric annotation: identity, bridging, discourse deixis, split antecedents.",
url: "",
entity_types: ["PER", "LOC", "ORG", "EVENT"],
language: "en",
domain: "mixed",
license: "LDC + Research",
citation: "Poesio et al. (2024)",
paper_url: "https://aclanthology.org/2024.codi-1.12/",
year: 2024,
format: "MMAX2",
notes: "Most comprehensive anaphora resource; RST/TRAINS/Pear/GENIA subsets; LDC2023T05",
splits: ["train", "dev", "test"],
tasks: ["coref", "bridging", "discourse_deixis"],
access_status: Registration,
categories: [coref, abstract_anaphora],
},
// ISNotes: bridging-anaphora annotations layered on OntoNotes (MMAX2).
// The zip at `url` holds the annotations; access is DependsOnOther because the
// underlying OntoNotes text is required for the full documents (see `notes`).
ISNotes {
name: "ISNotes",
description: "Unrestricted bridging anaphora on OntoNotes. ~660 bridging pairs.",
url: "https://github.com/nlpAThits/ISNotes1.0/archive/refs/heads/master.zip",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "news",
license: "Research",
citation: "Hou et al. (2018)",
paper_url: "https://direct.mit.edu/coli/article/44/2/237/1596/",
format: "MMAX2",
notes: "Part-whole set-member bridging; requires OntoNotes for full text",
tasks: ["coref"],
access_status: DependsOnOther,
categories: [coref, abstract_anaphora],
},
// Shell Nouns (ASN): anaphoric shell-noun resolution data. No entity types and
// no public download; the data must be requested from the authors.
ShellNouns {
name: "Shell Nouns (ASN)",
description: "Anaphoric shell noun resolution. 670 English shell nouns from Schmid taxonomy.",
url: "",
entity_types: [],
language: "en",
domain: "academic",
license: "Research",
citation: "Kolhatkar & Hirst (2012)",
paper_url: "https://aclanthology.org/D12-1036/",
notes: "Factual, linguistic, mental, modal, eventive categories",
access_status: ContactAuthors,
categories: [abstract_anaphora],
},
// PDTB 3.0: Penn Discourse TreeBank, distributed through the LDC.
// `paper_url` is the LDC catalog page; `url` is empty because access requires registration.
PDTBv3 {
name: "PDTB 3.0",
description: "Penn Discourse TreeBank v3. 43 discourse relation types.",
url: "",
entity_types: [],
language: "en",
domain: "news",
license: "LDC",
citation: "Prasad et al. (2019)",
paper_url: "https://catalog.ldc.upenn.edu/LDC2019T05",
notes: "Shallow discourse parsing; connective-argument pairs",
access_status: Registration,
categories: [abstract_anaphora],
},
// CODI-CRAC Bridging: dialogue bridging annotations in Universal Anaphora format.
// NOTE(review): `citation` and `year` say 2022 while `paper_url` links to an
// LREC 2024 paper — confirm which publication should be cited.
CODICRACBridging {
name: "CODI-CRAC Bridging",
description: "Universal Anaphora bridging annotations. One of the largest bridging datasets.",
url: "https://github.com/UniversalAnaphora/UA-CODI-CRAC",
entity_types: ["BRIDGING_REF"],
language: "en",
domain: "dialogue",
license: "CC-BY-4.0",
citation: "CODI-CRAC (2022)",
paper_url: "https://aclanthology.org/2024.lrec-main.1484.pdf",
year: 2022,
format: "CoNLLUA",
size_hint: "AMI, LIGHT, PERSUASION subsets",
notes: "Dialogue-heavy; extensive bridging annotations; Universal Anaphora format",
splits: ["train", "dev", "test"],
tasks: ["coref", "bridging"],
categories: [coref, abstract_anaphora, dialogue],
},
// Anaphora Accessibility: test-only evaluation set for discourse anaphora with
// non-nominal antecedents. Not yet released, hence the empty `url`.
// NOTE(review): `citation` looks like a placeholder ("Accessibility Authors") —
// replace it with the real author list from the arXiv paper.
AnaphoraAccessibility {
name: "Anaphora Accessibility",
description: "Discourse anaphora accessibility evaluation. Tests non-NP antecedents.",
url: "",
entity_types: ["DISCOURSE_DEIXIS", "EVENT_ANAPHORA", "CLAUSAL_ANTECEDENT"],
language: "en",
domain: "evaluation",
license: "Research",
citation: "Accessibility Authors (2025)",
paper_url: "https://arxiv.org/html/2502.14119v1",
year: 2025,
format: "JSONL",
size_hint: "Controlled evaluation set",
notes: "2025 evaluation dataset; focuses on discourse-level anaphora understanding; non-nominal antecedents",
splits: ["test"],
tasks: ["coref", "discourse_deixis"],
access_status: NotYetReleased,
categories: [coref, abstract_anaphora],
},
// Human-Voice Agent Interaction: small French dialogue set stored locally
// (access_status: Local; path given in `notes`), so `url` is empty.
// NOTE(review): `paper_url` is the journal landing page, not a specific article.
HumanVoiceAgentInteraction {
name: "Human-Voice Agent Interaction",
description: "Naturalistic French dialogue from human-voice agent interactions. Response tokens, aside sequences, discourse deixis.",
url: "",
entity_types: ["RESPONSE_TOKEN", "DISCOURSE_DEIXIS", "PROPOSITIONAL_ANAPHORA"],
language: "fr",
domain: "dialogue",
license: "Research",
citation: "Rudaz, Broth & Mlynář (2025)",
paper_url: "https://dl.acm.org/journal/tochi",
year: 2025,
format: "JSONL",
size_hint: "70 turns, 10 discourse deixis examples, 11 response tokens",
notes: "French dialogue from Pepper robot (2022) and ChatGPT voice mode (2025). Documents 'managed omnirelevance of speech' - how VAD-based agents misinterpret response tokens and asides. Local dataset at testdata/human_voice_agent/",
splits: ["all"],
tasks: ["coref", "abstract_anaphora"],
access_status: Local,
categories: [abstract_anaphora, dialogue],
},
// Disco-Bench: document-level discourse evaluation over Chinese/English
// literature. Test-only and not yet released, hence the empty `url`.
// `notes` say "inter-sentence": the benchmark targets phenomena that cross
// sentence boundaries, which the previous "intra-sentence" wording contradicted.
DiscoBench {
name: "Disco-Bench",
description: "Discourse-aware evaluation benchmark for language modeling. 9 document-level testsets covering cohesion, coherence across Chinese/English literature.",
url: "",
entity_types: [],
language: "zh-en",
domain: "literature",
license: "CC-BY-4.0",
citation: "Wang et al. (2023)",
paper_url: "https://arxiv.org/abs/2307.08074",
year: 2023,
format: "Custom",
size_hint: "9 document-level testsets + diagnostic suite",
notes: "Tests inter-sentence discourse properties: cohesion, coherence, entity tracking. Evaluates LLMs on discourse phenomena that cross sentences.",
splits: ["test"],
tasks: ["discourse_coherence"],
access_status: NotYetReleased,
categories: [abstract_anaphora, multilingual],
},
// DiscoTrack: multilingual discourse-tracking benchmark with four task levels.
// Test-only and not yet released, hence the empty `url`.
DiscoTrack {
name: "DiscoTrack",
description: "Multilingual LLM benchmark for discourse tracking. 12 languages, 4 levels: salience, entity tracking, discourse relations, bridging.",
url: "",
entity_types: ["SALIENT_ENTITY", "TRACKED_ENTITY", "BRIDGING_REF"],
language: "mul",
domain: "general",
license: "CC-BY-4.0",
citation: "Bu, Levine & Zeldes (2025)",
paper_url: "https://arxiv.org/abs/2510.17013",
year: 2025,
format: "Custom",
size_hint: "12 languages, 4 task levels",
notes: "Tests implicit information and pragmatic inference across documents. Four levels: salience recognition, entity tracking, discourse relations, bridging inference. State-of-the-art models still struggle.",
splits: ["test"],
tasks: ["coref", "bridging", "discourse_deixis"],
access_status: NotYetReleased,
categories: [coref, abstract_anaphora, multilingual],
},
// LIEDER: controlled test set probing LLM knowledge of discourse-entity
// semantics. No public download; the data must be requested from the authors.
LIEDER {
name: "LIEDER",
description: "Linguistically-Informed Evaluation for Discourse Entity Recognition. Tests existence, uniqueness, plurality, novelty.",
url: "",
entity_types: ["DISCOURSE_ENTITY"],
language: "en",
domain: "evaluation",
license: "Research",
citation: "Zhu & Frank (2024)",
paper_url: "https://arxiv.org/abs/2403.06301",
year: 2024,
format: "Custom",
size_hint: "Controlled evaluation set",
notes: "Tests LLM knowledge of semantic properties governing discourse entity introduction/reference: existence, uniqueness, plurality, novelty. Models show sensitivity to all except novelty.",
splits: ["test"],
tasks: ["discourse_deixis"],
access_status: ContactAuthors,
categories: [abstract_anaphora],
},
// GCDC: discourse-coherence ratings over real-world texts from four sources.
// Access is DependsOnOther; `depends_on` records the prerequisite corpus and
// the request procedure.
GCDC {
name: "GCDC",
description: "Grammarly Corpus of Discourse Coherence. Real-world texts with coherence ratings across 4 domains.",
url: "",
entity_types: [],
language: "en",
domain: "mixed",
license: "Research",
citation: "Lai & Tetreault (2018)",
paper_url: "https://arxiv.org/abs/1805.04993",
year: 2018,
format: "TSV",
size_hint: "~2000 texts, 4 domains",
notes: "Four domains: Clinton emails, Enron, Yahoo answers, Yelp. First large-scale discourse coherence evaluation.",
splits: ["train", "dev", "test"],
tasks: ["discourse_coherence"],
access_status: DependsOnOther,
depends_on: "Yahoo L6 corpus (request from Yahoo, then email tetreaul@gmail.com)",
categories: [abstract_anaphora],
},
// DISAPERE: discourse annotation of peer-review/rebuttal pairs.
// Publicly available; `notes` record that a copy is already in the local cache.
DISAPERE {
name: "DISAPERE",
description: "Discourse Structure in Peer Review Discussions. 20k sentences in 506 review-rebuttal pairs with argumentation annotation.",
url: "https://github.com/nnkennard/DISAPERE",
entity_types: ["REVIEW_ARG", "REBUTTAL_ARG", "STANCE"],
language: "en",
domain: "scientific",
license: "CC-BY-4.0",
citation: "Kennard et al. (2022)",
paper_url: "https://arxiv.org/abs/2110.08520",
year: 2022,
format: "JSONL",
size_hint: "20k sentences, 506 review-rebuttal pairs",
notes: "Discourse relations between reviews and rebuttals. Fine-grained rebuttal annotation: context in review, stance toward arguments. Expert-annotated. DOWNLOADED to anno cache.",
splits: ["train", "dev", "test"],
tasks: ["discourse_relations"],
access_status: Public,
categories: [abstract_anaphora, dialogue],
},
// PragmEval: compilation of pragmatics-oriented classification datasets
// (discourse relations, speech acts, stance, and others listed in `notes`).
// Publicly available; `notes` record that a copy is already in the local cache.
PragmEval {
name: "PragmEval",
description: "Pragmatics-centered evaluation framework: 11 datasets covering discourse relations, speech acts, stance, sarcasm, verifiability.",
url: "https://github.com/sileod/pragmeval",
entity_types: [],
language: "en",
domain: "dialogue",
license: "Research",
citation: "Sileo et al. (2022)",
paper_url: "https://aclanthology.org/2022.lrec-1.255",
year: 2022,
format: "TSV",
size_hint: "20 subsets, ~130k examples total",
notes: "Compilation of PDTB, STAC, GUM (discourse relations), Emergent (stance), SarcasmV2, SwitchBoard/MRDA (speech acts), Verifiability, Persuasion, Squinky, EmoBank. DOWNLOADED to anno cache.",
splits: ["train", "dev", "test"],
tasks: ["discourse_relations", "speech_act_classification"],
hf_id: "pragmeval",
access_status: Public,
categories: [abstract_anaphora, dialogue],
},
// DISRPT 2025: shared-task benchmark for discourse segmentation, connective
// detection and relation classification across multiple frameworks and languages.
// `url` is the shared-task repository page.
DISRPT2025 {
name: "DISRPT 2025",
description: "Cross-formalism benchmark for discourse segmentation, connective detection, and relation classification. 39 corpora, 16 languages, 6 frameworks.",
url: "https://github.com/disrpt/sharedtask2025",
entity_types: ["DISCOURSE_UNIT", "CONNECTIVE", "DISCOURSE_RELATION"],
language: "mul",
domain: "general",
license: "Research",
citation: "DISRPT Organizers (2025)",
paper_url: "https://aclanthology.org/2025.disrpt-1.1.pdf",
year: 2025,
format: "CoNLL",
size_hint: "39 corpora, 5.1M tokens, 311k relations",
notes: "Unified 17-label relation scheme mapping 353 original labels. Includes RST, eRST, SDRT, PDTB, dependency, ISO frameworks. Czech, German, English, Basque, Persian, French, Italian, Dutch, Nigerian Pidgin, Polish, Portuguese, Russian, Spanish, Thai, Turkish, Chinese. EMNLP 2025 shared task.",
splits: ["train", "dev", "test"],
tasks: ["discourse_segmentation", "discourse_relations"],
access_status: Public,
categories: [abstract_anaphora, multilingual, dialogue],
},
// Ancient Greek UD: Perseus treebank in CoNLL-U.
// `url` fetches only the test file (grc_perseus-ud-test.conllu).
// The `example` shows a BIO entity tag in the tenth column.
AncientGreekUD {
name: "Ancient Greek UD",
description: "Universal Dependencies for Ancient Greek. Homeric through Byzantine.",
url: "https://raw.githubusercontent.com/UniversalDependencies/UD_Ancient_Greek-Perseus/master/grc_perseus-ud-test.conllu",
entity_types: ["PER", "LOC", "ORG"],
language: "grc",
domain: "literature",
license: "CC-BY-NC-SA-3.0",
citation: "Celano et al. (2016)",
paper_url: "https://aclanthology.org/L16-1158/",
year: 2016,
format: "CoNLLU",
example: "# text = μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος\n1\tμῆνιν\tμῆνις\tNOUN\t_\t_\t2\tobj\t_\tO\n5\tἈχιλῆος\tἈχιλλεύς\tPROPN\t_\t_\t4\tnmod\t_\tB-PER",
notes: "Perseus treebank; spans 1500+ years of Greek; Homeric to Byzantine",
categories: [ner, ancient],
},
// Latin UD: Index Thomisticus treebank (ITTB) in CoNLL-U.
// `url` fetches only the test file (la_ittb-ud-test.conllu).
LatinUD {
name: "Latin UD",
description: "Universal Dependencies for Latin. Classical through Medieval.",
url: "https://raw.githubusercontent.com/UniversalDependencies/UD_Latin-ITTB/master/la_ittb-ud-test.conllu",
entity_types: ["PER", "LOC", "ORG"],
language: "la",
domain: "literature",
license: "CC-BY-NC-SA-3.0",
citation: "Passarotti et al. (2017)",
paper_url: "https://aclanthology.org/W17-6526/",
format: "CoNLLU",
notes: "Index Thomisticus treebank; medieval scholastic",
categories: [ner, ancient],
},
// Coptic Scriptorium: Sahidic Coptic corpus with multi-layer annotation.
// `url` is the project data portal; per `notes` the data needs an ANNIS export,
// and access is marked Registration.
CopticScriptorium {
name: "Coptic Scriptorium",
description: "Sahidic Coptic with multi-layer annotation. ~50k tokens.",
url: "https://data.copticscriptorium.org/",
entity_types: ["PER", "LOC", "ORG"],
language: "cop",
domain: "religious",
license: "CC-BY-4.0",
citation: "Zeldes & Schroeder (2016)",
paper_url: "https://aclanthology.org/L16-1313/",
format: "CoNLLU",
notes: "Multi-layer morphology/syntax/entities/coreference; requires ANNIS export",
access_status: Registration,
categories: [ner, ancient],
},
// LT4HALA Hebrew: Biblical Hebrew NER and coreference annotation.
// No public download; the data must be requested from the authors.
// `paper_url` is the workshop site rather than a specific paper.
LT4HALA {
name: "LT4HALA Hebrew",
description: "Biblical Hebrew NER and coreference annotation.",
url: "",
entity_types: ["PER", "LOC", "ORG", "GPE"],
language: "hbo",
domain: "religious",
license: "Research",
citation: "LREC-COLING 2024 LT4HALA Workshop",
paper_url: "https://lt4hala2024.github.io/",
notes: "First systematic biblical Hebrew NER+coref",
tasks: ["ner", "coref"],
access_status: ContactAuthors,
categories: [ner, coref, ancient],
},
// ORACC: cuneiform corpus covering Sumerian, Akkadian and Urartian.
// `language` holds a single code ("akk") although the description lists three
// languages. `url` is the project site; `notes` mention a JSON export via API.
ORACC {
name: "ORACC",
description: "Open Richly Annotated Cuneiform Corpus. Sumerian, Akkadian, Urartian.",
url: "http://oracc.museum.upenn.edu/",
entity_types: ["PER", "LOC", "ORG", "DIVINE"],
language: "akk",
domain: "historical",
license: "CC-BY-SA-3.0",
citation: "ORACC Project",
paper_url: "http://oracc.museum.upenn.edu/doc/about/index.html",
format: "JSON",
notes: "Cuneiform logographic+syllabic polyphony challenges; JSON export via API",
access_status: Registration,
categories: [ner, ancient],
},
// MasakhaNER: BIO-tagged NER for African languages (CoNLL format).
// NOTE(review): `url` fetches data/yor/test.txt — presumably the Yoruba test
// split only; the other languages would need `hf_id` or further URLs.
MasakhaNER {
name: "MasakhaNER",
description: "NER for 10 African languages. PER/LOC/ORG/DATE.",
url: "https://raw.githubusercontent.com/masakhane-io/masakhane-ner/main/data/yor/test.txt",
entity_types: ["PER", "LOC", "ORG", "DATE"],
language: "mul",
domain: "news",
license: "CC-BY-4.0",
citation: "Adelani et al. (2021)",
paper_url: "https://aclanthology.org/2021.tacl-1.66/",
year: 2021,
format: "CoNLL",
annotation_scheme: "BIO",
example: "Olúṣẹ́gun B-PER\nObásanjọ́ I-PER\nní O\nìlú O\nAbẹ́òkúta B-LOC\n. O",
notes: "Critically underrepresented languages; community-driven; tonal diacritics preserved",
hf_id: "masakhane/masakhaner",
categories: [ner, multilingual, low_resource, african_language],
},
// MasakhaNER 2.0: BIO-tagged NER for African languages, hosted on Hugging Face.
// `description` states the language count as 20 so that it agrees with `notes`
// (it previously said "20+").
MasakhaNER2 {
name: "MasakhaNER 2.0",
description: "Extended MasakhaNER covering 20 African languages.",
url: "https://huggingface.co/datasets/masakhane/masakhaner2",
entity_types: ["PER", "LOC", "ORG", "DATE"],
language: "mul",
domain: "news",
license: "CC-BY-NC-4.0",
citation: "Adelani et al. (2022)",
paper_url: "https://aclanthology.org/2022.emnlp-main.298/",
year: 2022,
format: "CoNLL",
annotation_scheme: "BIO",
notes: "Extended to 20 languages; includes tonal languages",
splits: ["train", "dev", "test"],
tasks: ["ner"],
hf_id: "masakhane/masakhaner2",
categories: [ner, multilingual, low_resource, african_language],
},
// AfriSenti: tweet-level sentiment classification for African languages.
// This is not an entity dataset: `entity_types` holds the three sentiment labels.
AfriSenti {
name: "AfriSenti",
description: "Sentiment analysis for 14 African languages. 110k+ tweets. SemEval 2023 Task 12.",
url: "https://huggingface.co/datasets/shmuhammad/AfriSenti-twitter-sentiment",
entity_types: ["positive", "neutral", "negative"],
language: "mul",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Muhammad et al. (2023)",
paper_url: "https://aclanthology.org/2023.emnlp-main.862/",
year: 2023,
format: "HuggingFace",
size_hint: "~111k tweets",
example: "tweet: ይሄው ነው አይደል የእውቀትሽ ጥግ (Amharic)\nlabel: negative",
notes: "Amharic, Algerian/Moroccan Arabic, Hausa, Igbo, Kinyarwanda, Oromo, Nigerian Pidgin, Mozambican Portuguese, Swahili, Tigrinya, Xitsonga, Twi, Yoruba",
splits: ["train", "validation", "test"],
tasks: ["sentiment"],
hf_id: "shmuhammad/AfriSenti-twitter-sentiment",
categories: [multilingual, low_resource, social_media, african_language],
},
// AfriQA: question-answering dataset (tasks = ["qa"]). `entity_types` is an
// empty list for this entry. One field per line, matching the other entries.
AfriQA {
name: "AfriQA",
description: "Cross-lingual QA for 10 African languages. Wikipedia-based.",
url: "https://huggingface.co/datasets/masakhane/afriqa",
entity_types: [],
language: "mul",
domain: "wikipedia",
license: "CC-BY-4.0",
citation: "Ogundepo et al. (2023)",
paper_url: "https://aclanthology.org/2023.findings-emnlp.997/",
year: 2023,
format: "JSONL",
notes: "Cross-lingual retrieval QA; questions in African languages, passages in English/target",
splits: ["train", "dev", "test"],
tasks: ["qa"],
hf_id: "masakhane/afriqa",
categories: [multilingual, low_resource, african_language],
},
// MasakhaNEWS: news topic classification (tasks = ["text_classification"]).
// `entity_types` holds the 7 topic labels, which `notes` repeats.
// `paper_url` points at the IJCNLP-AACL 2023 proceedings entry for the paper.
MasakhaNEWS {
name: "MasakhaNEWS",
description: "News topic classification for 16 African languages.",
url: "https://huggingface.co/datasets/masakhane/masakhanews",
entity_types: ["business", "entertainment", "health", "politics", "religion", "sports", "technology"],
language: "mul",
domain: "news",
license: "Apache-2.0",
citation: "Adelani et al. (2023)",
paper_url: "https://aclanthology.org/2023.ijcnlp-main.10/",
year: 2023,
format: "HuggingFace",
notes: "7 topics: business, entertainment, health, politics, religion, sports, technology",
splits: ["train", "dev", "test"],
tasks: ["text_classification"],
hf_id: "masakhane/masakhanews",
categories: [multilingual, low_resource, news, african_language],
},
// AG News: 4-class news topic classification (tasks = ["text_classification"]).
// `entity_types` holds the class labels, not entity classes, so the entry is
// categorised as `news` (as MasakhaNEWS is) rather than `ner`.
// `hf_id` uses the same namespaced repository that `url` points at.
AGNews {
name: "AG News",
description: "News article topic classification. 4 classes: World, Sports, Business, Sci/Tech.",
url: "https://huggingface.co/datasets/fancyzhx/ag_news",
entity_types: ["World", "Sports", "Business", "Sci/Tech"],
language: "en",
domain: "news",
license: "Non-commercial",
citation: "Zhang et al. (2015)",
paper_url: "https://arxiv.org/abs/1509.01626",
year: 2015,
format: "HuggingFace",
size_hint: "120k train, 7.6k test",
notes: "Character-level ConvNet paper; standard text classification benchmark",
splits: ["train", "test"],
tasks: ["text_classification"],
hf_id: "fancyzhx/ag_news",
access_status: Public,
categories: [news],
},
// DBPedia-14: 14-class Wikipedia article classification
// (tasks = ["text_classification"]). `entity_types` holds the class labels,
// which `notes` repeats. `hf_id` uses the same namespaced repository that
// `url` points at.
// NOTE(review): `categories` lists ner/entity_linking although the only task
// is text_classification — confirm whether this is intentional.
DBPedia14 {
name: "DBPedia-14",
description: "Wikipedia article classification. 14 non-overlapping classes from DBpedia ontology.",
url: "https://huggingface.co/datasets/fancyzhx/dbpedia_14",
entity_types: ["Company", "EducationalInstitution", "Artist", "Athlete", "OfficeHolder", "MeanOfTransportation", "Building", "NaturalPlace", "Village", "Animal", "Plant", "Album", "Film", "WrittenWork"],
language: "en",
domain: "wikipedia",
license: "CC-BY-SA-3.0",
citation: "Zhang et al. (2015)",
paper_url: "https://arxiv.org/abs/1509.01626",
year: 2015,
format: "HuggingFace",
size_hint: "560k train, 70k test",
notes: "14 classes: Company, EducationalInstitution, Artist, Athlete, OfficeHolder, MeanOfTransportation, Building, NaturalPlace, Village, Animal, Plant, Album, Film, WrittenWork",
splits: ["train", "test"],
tasks: ["text_classification"],
hf_id: "fancyzhx/dbpedia_14",
access_status: Public,
categories: [ner, entity_linking],
},
// Yahoo Answers Topic: 10-class topic classification
// (tasks = ["text_classification"]). `entity_types` holds the class labels,
// which `notes` repeats.
// NOTE(review): `categories` is [dialogue] while the task is topic
// classification — confirm the category is intended.
YahooAnswers {
name: "Yahoo Answers Topic",
description: "Question-answer topic classification. 10 classes covering diverse topics.",
url: "https://huggingface.co/datasets/community-datasets/yahoo_answers_topics",
entity_types: ["Society", "Science", "Health", "Education", "Computers", "Sports", "Business", "Entertainment", "Family", "Politics"],
language: "en",
domain: "qa",
license: "Non-commercial",
citation: "Zhang et al. (2015)",
paper_url: "https://arxiv.org/abs/1509.01626",
year: 2015,
format: "HuggingFace",
size_hint: "1.4M train, 60k test",
notes: "10 classes: Society, Science, Health, Education, Computers, Sports, Business, Entertainment, Family, Politics",
splits: ["train", "test"],
tasks: ["text_classification"],
hf_id: "community-datasets/yahoo_answers_topics",
access_status: Public,
categories: [dialogue],
},
// TREC question classification (tasks = ["text_classification"]).
// `entity_types` holds the 6 coarse class labels; the 50 fine-grained types
// mentioned in the description are not enumerated here.
// `paper_url` uses the aclanthology.org host like the other ACL Anthology
// links in this registry.
TREC {
name: "TREC Question Classification",
description: "Question type classification. 6 coarse classes, 50 fine-grained types.",
url: "https://huggingface.co/datasets/trec",
entity_types: ["ABBR", "DESC", "ENTY", "HUM", "LOC", "NUM"],
language: "en",
domain: "qa",
license: "CC-BY-4.0",
citation: "Li & Roth (2002)",
paper_url: "https://aclanthology.org/C02-1150/",
year: 2002,
format: "HuggingFace",
size_hint: "5.5k train, 500 test",
notes: "6 coarse: Abbreviation, Entity, Description, Human, Location, Numeric; 50 fine-grained",
splits: ["train", "test"],
tasks: ["text_classification"],
hf_id: "trec",
access_status: Public,
categories: [dialogue],
},
// TweetTopic: multi-label tweet topic classification
// (tasks = ["text_classification"]). `entity_types` lists the 6 domains; the
// 19 topics mentioned in the description are not enumerated in this entry.
TweetTopic {
name: "TweetTopic",
description: "Multi-label topic classification for tweets. 6 domains, 19 topics.",
url: "https://huggingface.co/datasets/cardiffnlp/tweet_topic_multi",
entity_types: ["arts", "business", "daily_life", "pop_culture", "science", "sports"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Antypas et al. (2022)",
paper_url: "https://aclanthology.org/2022.coling-1.299/",
year: 2022,
format: "HuggingFace",
size_hint: "~11k tweets",
notes: "Multi-label; domains: arts, business, daily_life, pop_culture, science, sports; zero-shot benchmark",
splits: ["train", "validation", "test"],
tasks: ["text_classification"],
hf_id: "cardiffnlp/tweet_topic_multi",
access_status: Public,
categories: [social_media],
},
// MasakhaPOS: part-of-speech tagging (tasks = ["pos"]). `entity_types` holds
// the POS tag inventory rather than entity classes.
// `format` uses the "CoNLLU" spelling shared by the UD treebank entries in
// this registry. No `annotation_scheme` is declared: POS tags are one label
// per token, so a span scheme such as IOB2 does not apply.
// NOTE(review): `url` is a GitHub repository page, not a data file; confirm
// that `hf_id` matches the published Hugging Face dataset name.
MasakhaPOS {
name: "MasakhaPOS",
description: "Part-of-speech tagging for 20 African languages.",
url: "https://github.com/masakhane-io/masakhane-pos",
entity_types: ["NOUN", "VERB", "ADJ", "ADV", "PRON", "PROPN", "ADP", "AUX", "CCONJ", "DET", "INTJ", "NUM", "PART", "PUNCT", "SCONJ", "SYM", "X"],
language: "mul",
domain: "general",
license: "MIT",
citation: "Dione et al. (2023)",
paper_url: "https://aclanthology.org/2023.acl-long.609/",
year: 2023,
format: "CoNLLU",
notes: "Universal Dependencies tagset; includes Bambara, Ewe, Mossi, Chichewa",
splits: ["train", "dev", "test"],
tasks: ["pos"],
hf_id: "masakhane/masakhane-pos",
categories: [multilingual, low_resource, african_language],
},
// WikiANN: silver-standard multilingual NER. The entry is marked
// `language: "mul"`, but `hf_config` selects the "en" configuration.
// `year` is taken from the citation (Pan et al., 2017).
WikiANN {
name: "WikiANN",
description: "Silver-standard NER from Wikipedia hyperlinks. 282 languages.",
url: "https://huggingface.co/datasets/unimelb-nlp/wikiann",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "wikipedia",
license: "CC-BY-SA-4.0",
citation: "Pan et al. (2017)",
paper_url: "https://aclanthology.org/P17-1178/",
year: 2017,
example: "tokens: [Berlin, is, the, capital, of, Germany]\nner_tags: [B-LOC, O, O, O, O, B-LOC]",
notes: "Silver annotations; noisy but massive coverage",
hf_id: "unimelb-nlp/wikiann",
hf_config: "en",
access_status: HuggingFace,
categories: [ner, multilingual, low_resource],
},
// NaijaNER: Nigerian Pidgin NER. `url` is empty and `access_status` is
// ContactAuthors, so no download location is declared for this entry.
// `year` is taken from the citation (Oyewusi et al., 2021).
NaijaNER {
name: "NaijaNER",
description: "Nigerian Pidgin NER corpus.",
url: "",
entity_types: ["PER", "LOC", "ORG"],
language: "pcm",
domain: "social_media",
license: "Research",
citation: "Oyewusi et al. (2021)",
paper_url: "https://arxiv.org/abs/2102.05236",
year: 2021,
notes: "Nigerian Pidgin English; code-mixing common",
access_status: ContactAuthors,
categories: [ner, low_resource],
},
// WIESP2022-NER (DEAL): astrophysics NER hosted on Hugging Face.
// The description mentions 31 entity types; `entity_types` lists 7 of them.
WIESP2022NER {
name: "WIESP2022-NER (DEAL)",
description: "Astrophysics NER from NASA ADS. 31 entity types: facilities, wavelengths, telescopes, archives.",
url: "https://huggingface.co/datasets/adsabs/WIESP2022-NER",
entity_types: ["WAVELENGTH", "TELESCOPE", "FACILITY", "MODEL", "ARCHIVE", "DATASET", "MISSION"],
language: "en",
domain: "scientific",
license: "CC-BY-4.0",
citation: "Grezes et al. (2022)",
paper_url: "https://aclanthology.org/2022.wiesp-1.9/",
year: 2022,
format: "JSONL",
size_hint: "~3000 annotated abstracts",
notes: "AACL-IJCNLP 2022 WIESP shared task; NASA ADS astrophysics literature",
hf_id: "adsabs/WIESP2022-NER",
categories: [ner, arcane_domain],
},
// Dutch Archaeology NER: CoNLL-format Dutch corpus with 6 entity types.
// `url` is a European Language Grid catalogue page; no `hf_id`, `splits`, or
// `access_status` are declared for this entry.
DutchArchaeology {
name: "Dutch Archaeology NER",
description: "Archaeological excavation reports from DANS archive. 31k annotations across 6 entity types.",
url: "https://live.european-language-grid.eu/catalogue/corpus/13410",
entity_types: ["ARTEFACT", "PERIOD", "MATERIAL", "LOCATION", "SPECIES", "CONTEXT"],
language: "nl",
domain: "archaeology",
license: "CC-BY-4.0",
citation: "Brandsen et al. (2020)",
paper_url: "https://aclanthology.org/2020.lrec-1.562/",
year: 2020,
format: "CoNLL",
size_hint: "~31k entity annotations, high IAA (0.95)",
notes: "Dutch grey literature; basis for ArcheoBERTje model",
categories: [ner, arcane_domain],
},
// E-NER (EDGAR-NER): legal-domain NER over SEC filings. `url` is a raw CSV
// file; per `notes`, rows are token,tag pairs using the BIO scheme (no
// separate `annotation_scheme` field is declared).
ENer {
name: "E-NER (EDGAR-NER)",
description: "NER for US SEC EDGAR filings. 52 documents, 400k+ tokens with legal entities.",
url: "https://raw.githubusercontent.com/terenceau1/E-NER-Dataset/main/all.csv",
entity_types: ["PERSON", "COURT", "BUSINESS", "GOVERNMENT", "LOCATION", "LEGISLATION"],
language: "en",
domain: "legal",
license: "GPL-3.0",
citation: "Au et al. (2022)",
paper_url: "https://aclanthology.org/2022.nllp-1.22/",
year: 2022,
format: "CSV",
size_hint: "52 SEC filings, 400k+ tokens",
notes: "10-K, 8-K, prospectuses; CSV token,tag format with BIO scheme",
tasks: ["ner"],
categories: [ner, arcane_domain],
},
// FINER: food-ingredient NER in IOB2-tagged CoNLL format. `url` is a figshare
// download link; per `notes`, the payload is a RAR archive containing the
// CoNLL files. Declares both `ner` and `slot_filling` tasks.
FINER {
name: "FINER (Food Ingredients NER)",
description: "Food ingredient NER from AllRecipes. 182k sentences with ingredient phrases in IOB2 format.",
url: "https://figshare.com/ndownloader/files/36144501",
entity_types: ["INGREDIENT", "PRODUCT", "QUANTITY", "UNIT", "STATE"],
language: "en",
domain: "food",
license: "CC-BY-4.0",
citation: "Komariah et al. (2022)",
paper_url: "https://doi.org/10.6084/m9.figshare.20222361",
year: 2022,
format: "CoNLL",
annotation_scheme: "IOB2",
size_hint: "~182k sentences, ingredient phrases",
notes: "Semi-supervised multi-model construction from AllRecipes; RAR archive with CoNLL format",
splits: ["train", "test"],
tasks: ["ner", "slot_filling"],
categories: [ner, arcane_domain],
},
// AnnoCTR: cybersecurity NER with entity linking. `url` is a zip archive of
// the repository's `main` branch (not pinned to a commit).
// NOTE(review): `tasks` includes "entity_linking" but `categories` lists only
// ner/arcane_domain — confirm whether an entity_linking category is wanted.
AnnoCTR {
name: "AnnoCTR (Cyber Threat Reports)",
description: "Cyber threat intelligence NER with MITRE ATT&CK linking. 400 annotated documents from commercial CTI vendors.",
url: "https://github.com/boschresearch/anno-ctr-lrec-coling-2024/archive/refs/heads/main.zip",
entity_types: ["ORGANIZATION", "LOCATION", "SECTOR", "TIME", "CODE", "THREAT_ACTOR", "MALWARE", "TOOL", "TACTIC", "TECHNIQUE"],
language: "en",
domain: "cybersecurity",
license: "CC-BY-SA-4.0",
citation: "Lange et al. (2024)",
paper_url: "https://arxiv.org/abs/2404.07765",
year: 2024,
format: "JSONL",
annotation_scheme: "BIO",
size_hint: "400 documents, multi-layer annotation",
notes: "LREC-COLING 2024; links to Wikipedia and MITRE ATT&CK KB; includes entity linking task",
splits: ["train", "dev", "test"],
tasks: ["ner", "entity_linking"],
categories: [ner, arcane_domain],
},
// CRAFT: biomedical full-text corpus in standoff XML. Categorised as
// coref/biomedical/arcane_domain (no `ner`), although `entity_types` is
// populated. `url` is a zip archive of the repository's `master` branch
// (not pinned to a commit).
CRAFT {
name: "CRAFT",
description: "Colorado Richly Annotated Full-Text. 97 PubMed articles with multi-layer annotation including coreference.",
url: "https://github.com/UCDenver-ccp/CRAFT/archive/refs/heads/master.zip",
entity_types: ["GENE", "PROTEIN", "CHEMICAL", "CELL", "ORGANISM"],
language: "en",
domain: "biomedical",
license: "CC-BY-3.0",
citation: "Bada et al. (2012)",
paper_url: "https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-13-161",
year: 2012,
format: "XML",
annotation_scheme: "Standoff",
size_hint: "97 full-text articles, ~790k tokens",
notes: "Full-text (not just abstracts); 10 ontologies used for normalization",
categories: [coref, biomedical, arcane_domain],
},
// WNUT-16: Twitter NER shared task data in BIO-tagged CoNLL format.
// `url` fetches the test file from a commit-pinned path.
// `entity_types` lists all ten WNUT-16 label types so that the
// musicartist/movie/sportsteam/tvshow tags present in the data are covered.
WNUT16 {
name: "WNUT-16",
description: "Twitter NER workshop shared task. Focus on rare and emerging entities in noisy social media text.",
url: "https://raw.githubusercontent.com/aritter/twitter_nlp/65f3d77134c40d920db8d431c5c6faef1c051c94/data/annotated/wnut16/data/test",
entity_types: ["person", "geo-loc", "company", "facility", "product", "musicartist", "movie", "sportsteam", "tvshow", "other"],
language: "en",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Strauss et al. (2016)",
paper_url: "https://aclanthology.org/W16-3919/",
year: 2016,
format: "CoNLL",
annotation_scheme: "BIO",
size_hint: "3,856 test tweets, 2,394 train",
notes: "89% unseen test entities; predecessor to WNUT-17; harder than standard benchmarks",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, social_media, adversarial],
},
// Sanskrit UD: CoNLL-U treebank entry registered under the `ner` and
// `ancient` categories. `url` fetches the UD_Sanskrit-Vedic test file from the
// `master` branch (not pinned to a commit, unlike the other UD entries here).
SanskritUD {
name: "Sanskrit UD",
description: "Universal Dependencies for Vedic and Classical Sanskrit. Includes Vedas and epics.",
url: "https://raw.githubusercontent.com/UniversalDependencies/UD_Sanskrit-Vedic/master/sa_vedic-ud-test.conllu",
entity_types: ["PER", "LOC", "ORG"],
language: "sa",
domain: "religious",
license: "CC-BY-SA-4.0",
citation: "Hellwig et al. (2020)",
paper_url: "https://aclanthology.org/2020.lrec-1.632/",
year: 2020,
format: "CoNLLU",
notes: "Oldest Indo-European language with extensive NLP resources; Devanagari script",
categories: [ner, ancient, low_resource],
},
// Old English UD: CoNLL-U treebank entry; `url` is pinned to a specific commit
// of the UD_Old_English-Cairo repository (see `notes`).
// NOTE(review): the description mentions the York-Toronto-Helsinki corpus
// while `url` points at the Cairo treebank — confirm which source is meant.
OldEnglishUD {
name: "Old English UD",
description: "Universal Dependencies for Old English (Anglo-Saxon). York-Toronto-Helsinki corpus.",
url: "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_English-Cairo/118f11ad906fb15d930825fadce9ef9eccca9347/ang_cairo-ud-test.conllu",
entity_types: ["PER", "LOC", "ORG"],
language: "ang",
domain: "historical",
license: "CC-BY-SA-4.0",
citation: "Tischler & Walkden (2019)",
paper_url: "https://aclanthology.org/W19-4214/",
year: 2019,
format: "CoNLLU",
notes: "Anglo-Saxon; insular script variations; 5th-11th century CE. NOTE: UD repo naming drifts; URL pinned to a specific commit.",
categories: [ner, ancient, historical, low_resource],
},
// Old Norse UD: CoNLL-U treebank entry backed by the IcePaHC treebank.
// `url` is pinned to a specific commit of UD_Icelandic-IcePaHC; the
// description names the same treebank as `url` and `notes`.
OldNorseUD {
name: "Old Norse UD",
description: "Universal Dependencies for Old Norse/Icelandic Sagas. IcePaHC treebank.",
url: "https://raw.githubusercontent.com/UniversalDependencies/UD_Icelandic-IcePaHC/ca72a59affd87c4b7b9067ae56efa7a694a7b4c4/is_icepahc-ud-test.conllu",
entity_types: ["PER", "LOC", "ORG"],
language: "non",
domain: "literature",
license: "CC-BY-SA-4.0",
citation: "Rögnvaldsson et al. (2012)",
paper_url: "https://aclanthology.org/L12-1148/",
year: 2012,
format: "CoNLLU",
notes: "IcePaHC treebank (historical Icelandic; Old Norse family). URL pinned to a specific commit.",
categories: [ner, ancient, literary, low_resource],
},
// CALCS-2018: English-Spanish code-switched Twitter NER with 9 entity types,
// BIO-tagged CoNLL format. `url` is the workshop site rather than a data
// file; no `access_status` is declared.
CALCS2018 {
name: "CALCS-2018",
description: "Code-Switching Workshop shared task. English-Spanish Twitter NER with 9 entity types.",
url: "https://code-switching.github.io/2018/",
entity_types: ["PER", "LOC", "ORG", "GROUP", "TITLE", "PROD", "EVENT", "TIME", "OTHER"],
language: "mul",
domain: "social_media",
license: "Research",
citation: "Aguilar et al. (2018)",
paper_url: "https://aclanthology.org/W18-3219/",
year: 2018,
format: "CoNLL",
annotation_scheme: "BIO",
notes: "Spanglish; first major code-switching NER shared task",
categories: [ner, social_media, multilingual, low_resource],
},
// Hinglish NER: Hindi-English code-mixed NER, BIO-tagged. `url` is a GitHub
// repository page; per `notes`, the data is obtained through the
// CodemixedNLP toolkit rather than from a direct file link.
HinglishNER {
name: "Hinglish NER",
description: "Hindi-English code-mixed social media NER. Roman script Hindi mixed with English.",
url: "https://github.com/murali1996/CodemixedNLP",
entity_types: ["PER", "LOC", "ORG"],
language: "mul",
domain: "social_media",
license: "CC-BY-4.0",
citation: "Priyadharshini et al. (2020)",
paper_url: "https://aclanthology.org/2020.calcs-1.6/",
year: 2020,
format: "JSONL",
annotation_scheme: "BIO",
notes: "GLUECoS/LinCE benchmark; download via CodemixedNLP toolkit; Romanized Hindi; ~400M speakers use code-switching daily",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, social_media, multilingual, low_resource],
},
// Medieval Charter NER: multilingual (Latin/French/Spanish) charter NER in
// CoNLL format. `url` is a Zenodo record page; `notes` names the four charter
// collections referred to in `size_hint`.
MedievalCharterNER {
name: "Medieval Charter NER",
description: "Multilingual medieval charter NER. Latin, French, Spanish from major charter collections.",
url: "https://zenodo.org/records/6463699",
entity_types: ["PER", "LOC", "ORG", "DATE"],
language: "mul",
domain: "historical",
license: "CC-BY-4.0",
citation: "Camps et al. (2022)",
paper_url: "https://aclanthology.org/2022.lrec-1.530/",
year: 2022,
format: "CoNLL",
size_hint: "~100k tokens across 4 charter collections",
notes: "HOME-ALCAR, CBMA, Diplomata Belgica, CODEA; medieval Latin/vernacular",
categories: [ner, historical, multilingual, low_resource],
},
// CBMA Charters: medieval Latin charter NER. `url` is empty and
// `access_status` is ContactAuthors, so no download location is declared.
// One field per line, matching the other entries.
CBMACharters {
name: "CBMA Charters",
description: "Burgundian medieval Latin charters NER. 9th-14th century diplomatic documents.",
url: "",
entity_types: ["PER", "LOC", "ORG", "DATE", "TITLE"],
language: "la",
domain: "historical",
license: "Research",
citation: "Perreaux (2021)",
paper_url: "https://dhq-static.digitalhumanities.org/pdf/000574.pdf",
year: 2021,
format: "CoNLL",
notes: "Chartae Burgundiae Medii Aevi; medieval Latin; notarial hands",
access_status: ContactAuthors,
categories: [ner, ancient, historical, low_resource],
},
// MSNER: multilingual spoken NER, BIO-tagged CoNLL format. `url` is a
// repository page addressed by a DOI persistent identifier; `size_hint` is
// expressed in hours of audio rather than tokens or documents.
MSNER {
name: "MSNER",
description: "Multilingual Spoken NER. Speech-to-NER on VoxPopuli parliamentary speeches.",
url: "https://rdr.kuleuven.be/dataset.xhtml?persistentId=doi:10.48804/ZTVMIX",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "mul",
domain: "speech",
license: "CC-BY-4.0",
citation: "Evain et al. (2024)",
paper_url: "https://aclanthology.org/2024.isa-1.2/",
year: 2024,
format: "CoNLL",
annotation_scheme: "BIO",
size_hint: "~590h train, 17h gold test",
notes: "Dutch, French, German, Spanish; ASR transcripts; first multilingual speech NER corpus",
categories: [ner, speech, multilingual],
},
// NoiseBench: label-noise robustness benchmark for NER. `url` fetches the
// `clean.traindev` annotation file from the `main` branch; per `notes`, only
// the clean subset is used by this entry.
NoiseBench {
name: "NoiseBench",
description: "Robustness benchmark for NER. 6 real noise types: expert, crowd, LLM, distant/weak supervision.",
url: "https://raw.githubusercontent.com/elenamer/NoiseBench/main/data/annotations/clean.traindev",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "evaluation",
license: "MIT",
citation: "Merhej et al. (2024)",
paper_url: "https://aclanthology.org/2024.emnlp-main.1011/",
year: 2024,
format: "CoNLL",
size_hint: "CoNLL-03 subset with 7 label variants",
notes: "Compares simulated vs real label noise; includes German variant; using clean subset",
tasks: ["ner"],
access_status: Public,
categories: [ner, adversarial],
},
// RockNER: adversarial NER robustness benchmark; only a "test" split is
// declared. `url` is a GitHub repository page, not a direct data file.
// NOTE(review): `paper_url` and `notes` give ACL 2021 as the venue — verify
// against the ACL Anthology, as the paper may have appeared at EMNLP 2021.
RockNER {
name: "RockNER",
description: "Robustness benchmark for NER. Real-world adversarial examples with boundary ambiguity.",
url: "https://github.com/INK-USC/RockNER",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "evaluation",
license: "Apache-2.0",
citation: "Lin et al. (2021)",
paper_url: "https://aclanthology.org/2021.acl-long.340/",
year: 2021,
format: "CoNLL",
size_hint: "~1.5k challenging examples",
notes: "ACL 2021; entity boundary attacks, rare entities, syntactic perturbations; robustness stress test",
splits: ["test"],
tasks: ["ner"],
categories: [ner, adversarial],
},
// CrossWeigh / CoNLL++: `url` fetches `conllpp_test.txt`, the corrected
// CoNLL-2003 English test file distributed in the CrossWeigh repository.
// The entry is English-only (`language: "en"`), so the description, size
// hint, notes, and categories describe it as a corrected-label test set
// rather than a cross-lingual benchmark.
CrossWeigh {
name: "CrossWeigh",
description: "CoNLL++ test set released with CrossWeigh. CoNLL-2003 English test data with corrected labels.",
url: "https://raw.githubusercontent.com/ZihanWangKi/CrossWeigh/master/data/conllpp_test.txt",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "en",
domain: "evaluation",
license: "MIT",
citation: "Wang et al. (2019)",
paper_url: "https://aclanthology.org/D19-1519/",
year: 2019,
format: "CoNLL",
size_hint: "CoNLL-03 English test set with corrected labels (CoNLL++)",
notes: "CrossWeigh trains NER taggers from imperfect annotations; this entry uses the CoNLL++ corrected test file",
splits: ["test"],
tasks: ["ner"],
access_status: Public,
categories: [ner, adversarial],
},