GAP {
name: "GAP",
description: "Gendered Ambiguous Pronouns (GAP). Google's benchmark for exposing gender bias in coreference systems.",
url: "https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv",
entity_types: ["PER"],
language: "en",
domain: "wikipedia",
license: "Apache-2.0",
citation: "Webster et al. (2018)",
paper_url: "https://aclanthology.org/Q18-1042/",
year: 2018,
format: "TSV",
size_hint: "8,908 pronoun-name pairs",
example: "ID\tText\tPronoun\tA\tB\tA-coref\ntest-1\tZoe met Alice and she waved.\tshe\tZoe\tAlice\tFALSE",
notes: "Designed to expose gender bias; Kaggle shared task; balanced male/female",
tasks: ["coref"],
hf_id: "google-research-datasets/gap",
categories: [coref, bias_evaluation],
},
PreCo {
name: "PreCo",
description: "Large-scale coreference corpus built from English reading-comprehension texts. About 10x larger than OntoNotes.",
url: "https://huggingface.co/datasets/coref-data/preco_raw",
entity_types: ["PER", "LOC", "ORG"],
language: "en",
domain: "reading_comprehension",
license: "CC-BY-4.0",
citation: "Chen et al. (2018)",
paper_url: "https://aclanthology.org/D18-1016/",
year: 2018,
format: "JSONL",
annotation_scheme: "CoNLLCoref",
size_hint: "38k documents, includes singletons",
notes: "Preschool vocabulary for cleaner evaluation; largest public coref corpus",
splits: ["train", "dev", "test"],
tasks: ["coref"],
hf_id: "coref-data/preco_raw",
categories: [coref],
},
LitBank {
name: "LitBank",
description: "Literary coreference. 100 public-domain English fiction works (1719-1922) with ACE-style entities.",
url: "https://raw.githubusercontent.com/dbamman/litbank/master/coref/brat/1023_bleak_house_brat.ann",
entity_types: ["PER", "LOC", "ORG", "GPE", "FAC", "VEH"],
language: "en",
domain: "literature",
license: "CC-BY-4.0",
citation: "Bamman et al. (2019)",
paper_url: "https://aclanthology.org/N19-1220/",
year: 2019,
format: "BRAT",
annotation_scheme: "Standoff",
size_hint: "100 novels, ~2k tokens each",
notes: "Focus on character coreference; includes event coref; public domain texts",
splits: ["all"],
tasks: ["ner", "coref", "event_coref"],
expected_docs: 100,
categories: [coref, literary],
},
ECBPlus {
name: "ECB+",
description: "Event Coreference Bank Plus. Standard benchmark for cross-document event coreference resolution.",
url: "https://raw.githubusercontent.com/cltl/ecbPlus/master/ECB%2B_LREC2014/ECB%2B.zip",
entity_types: ["EVENT", "TIME", "LOC", "PARTICIPANT"],
language: "en",
domain: "news",
license: "CC-BY-3.0",
citation: "Cybulska & Vossen (2014)",
paper_url: "https://aclanthology.org/L14-1646/",
year: 2014,
format: "XML-ZIP",
size_hint: "43 topics, 982 docs, ~7k events",
example: "Doc1: 'The earthquake [struck] at 3am.' Doc2: 'The [tremor] caused damage.'\nEvents: struck_1, tremor_2 -> coreferent (same event)",
notes: "De facto CDCR standard; topic-clustered structure may cause overfitting",
splits: ["train", "dev", "test"],
tasks: ["coref", "event_coref", "cdcr"],
expected_docs: 982,
categories: [coref, event_coref],
},
OntoNotesCoref {
name: "OntoNotes Coreference",
description: "OntoNotes 5.0 coreference annotations. Gold-standard multi-genre coref including WSJ, broadcast, web.",
url: "https://catalog.ldc.upenn.edu/LDC2013T19",
entity_types: ["PER", "ORG", "GPE", "NORP"],
language: "en",
domain: "mixed",
license: "LDC",
citation: "Pradhan et al. (2012)",
paper_url: "https://aclanthology.org/W12-4501/",
year: 2012,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
size_hint: "3,493 documents, ~1.6M tokens",
notes: "De facto standard for within-document coreference evaluation",
categories: [coref],
},
WikiCoref {
name: "WikiCoref",
description: "Wikipedia coreference corpus. 30 documents with full coreference annotation.",
url: "",
entity_types: ["PER", "ORG", "LOC"],
language: "en",
domain: "wikipedia",
license: "CC-BY-SA-4.0",
citation: "Ghaddar & Langlais (2016)",
paper_url: "https://aclanthology.org/L16-1021/",
year: 2016,
format: "CoNLL",
size_hint: "30 documents, ~60k tokens",
notes: "Long documents averaging 2k tokens; challenging for span-based models. Prior download URL has an expired TLS cert; needs a fresh mirror.",
access_status: Deprecated,
categories: [coref],
},
ARRAU3 {
name: "ARRAU 3.0",
description: "Anaphora Resolution and Underspecification corpus version 3. Multi-genre with rich annotation.",
url: "https://aclanthology.org/2024.codi-1.12/",
entity_types: ["PER", "ORG", "LOC", "Event"],
language: "en",
domain: "mixed",
license: "Research",
citation: "Uryupina et al. (2024)",
paper_url: "https://aclanthology.org/2024.codi-1.12/",
year: 2024,
format: "MMAX2",
annotation_scheme: "ARRAU",
size_hint: "~350k tokens across multiple genres",
notes: "Rich annotation including bridging, discourse deixis, and ambiguity",
categories: [coref],
},
AMIMeeting {
name: "AMI Meeting",
description: "Meeting transcripts with coreference and dialogue act annotation.",
url: "https://groups.inf.ed.ac.uk/ami/download/",
entity_types: ["PER", "ORG", "LOC"],
language: "en",
domain: "dialogue",
license: "CC-BY-4.0",
citation: "Carletta et al. (2005)",
paper_url: "https://groups.inf.ed.ac.uk/ami/icsi/",
year: 2005,
format: "XML",
size_hint: "100 hours of meetings",
notes: "Multi-party dialogue; includes prosody and head gestures",
categories: [coref, dialogue],
},
CLEFClinicalCoref {
name: "CLEF Clinical Coreference",
description: "Clinical coreference from ShARe/CLEF eHealth. Patient records with coref.",
url: "",
entity_types: ["Disorder", "Drug", "Procedure"],
language: "en",
domain: "clinical",
license: "PhysioNet",
citation: "Suominen et al. (2013)",
paper_url: "https://clef2013.clef-initiative.eu/index.php?page=pages/proceedings.php",
year: 2013,
format: "Standoff",
size_hint: "298 discharge summaries",
notes: "Clinical concept coreference; disorder mentions across sentences. Was hosted on PhysioNet; URL appears gone/renamed; requires controlled access.",
access_status: Registration,
categories: [coref, biomedical],
},
RSTDT {
name: "RST Discourse Treebank",
description: "Penn Treebank WSJ articles annotated with Rhetorical Structure Theory (RST) trees. Discourse relations and structure.",
url: "https://catalog.ldc.upenn.edu/LDC2002T07",
entity_types: [],
language: "en",
domain: "news",
license: "LDC",
citation: "Carlson et al. (2001)",
paper_url: "https://aclanthology.org/W01-1605/",
year: 2001,
format: "Custom",
size_hint: "385 WSJ articles",
notes: "RST discourse structure; useful for discourse-aware coreference",
categories: [coref],
},
WinoBias {
name: "WinoBias",
description: "Coreference bias benchmark. Winograd-schema sentences testing occupational gender stereotypes.",
url: "https://raw.githubusercontent.com/uclanlp/corefBias/master/WinoBias/wino/data/anti_stereotyped_type1.txt.dev",
entity_types: ["PER"],
language: "en",
domain: "evaluation",
license: "MIT",
citation: "Zhao et al. (2018)",
paper_url: "https://aclanthology.org/N18-2003/",
year: 2018,
format: "Custom",
size_hint: "3,160 sentences",
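notes: "Type 1 (semantic; requires world knowledge) and Type 2 (resolvable from syntactic cues) splits; occupations and gender ratios drawn from US Bureau of Labor Statistics (BLS) data",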
categories: [coref, bias_evaluation],
},
QxoRef {
name: "qxoRef",
description: "First coreference corpus for Conchucos Quechua. Historically significant as first indigenous coref resource.",
url: "https://raw.githubusercontent.com/elizabethpankratz/qxoRef/f0eb5716573b3f428bfcfdda923b195d0e7967b8/qxoRef_AZ23.conll",
entity_types: ["PER", "LOC", "ORG"],
language: "qxo",
domain: "narrative",
license: "CC-BY-NC-SA-4.0",
citation: "Pankratz (2021)",
paper_url: "https://aclanthology.org/2021.americasnlp-1.1/",
year: 2021,
format: "CoNLL",
annotation_scheme: "CoNLLCoref",
size_hint: "12 docs, 332 mentions",
notes: "First indigenous coreference corpus; pro-drop language; agglutinative morphology",
categories: [coref, indigenous, low_resource],
},
AmericasNLI {
name: "AmericasNLI",
description: "NLI for 10 Indigenous American languages (Quechua, Guaraní, Nahuatl, etc.).",
url: "https://raw.githubusercontent.com/nala-cub/AmericasNLI/be3c351b7e1ae69936c61bfde3e24f30757db9ac/test.tsv",
entity_types: [],
language: "mul",
domain: "general",
license: "CC-BY-4.0",
citation: "Ebrahimi et al. (2022)",
paper_url: "https://aclanthology.org/2022.acl-long.435/",
year: 2022,
format: "TSV",
notes: "Tests zero-shot transfer from multilingual models; 10 indigenous languages",
categories: [indigenous, multilingual, low_resource],
},
CherokeeNER {
name: "Cherokee NER",
description: "Cherokee-English parallel corpus for NER transfer. Uses Syllabary script.",
url: "",
entity_types: ["PER", "LOC", "ORG"],
language: "chr",
domain: "indigenous",
license: "Research",
citation: "Zhang et al. (2020)",
paper_url: "https://aclanthology.org/2020.findings-emnlp.464/",
year: 2020,
format: "Custom",
notes: "Syllabary script (85 characters); polysynthetic language; ~7k speakers. Prior URL is dead; needs a fresh mirror.",
access_status: Deprecated,
categories: [ner, indigenous, low_resource],
},
NahuatlNER {
name: "Nahuatl NER",
description: "Named entity recognition for Nahuatl (Aztec language). Colonial-era texts and modern usage.",
url: "",
entity_types: ["PER", "LOC", "ORG"],
language: "nah",
domain: "historical",
license: "CC-BY-4.0",
citation: "Gutierrez-Vasquez et al. (2023)",
year: 2023,
format: "CoNLL",
notes: "Polysynthetic Uto-Aztecan language; ~1.7M speakers; includes colonial manuscripts. Prior URL is dead; needs a fresh mirror.",
access_status: Deprecated,
categories: [ner, indigenous, low_resource, historical],
},
MaoriNER {
name: "Māori NER",
description: "Named entity recognition for Te Reo Māori. New Zealand indigenous language corpus.",
url: "",
entity_types: ["PER", "LOC", "ORG"],
language: "mi",
domain: "indigenous",
license: "Research",
citation: "Te Hiku Media (2022)",
year: 2022,
format: "JSONL",
notes: "Polynesian language; ~50k fluent speakers; limited training data available",
access_status: ContactAuthors,
categories: [ner, indigenous, low_resource],
},
WelshNER {
name: "Welsh NER",
description: "Named entity recognition for Welsh (Cymraeg). Celtic language NER corpus.",
url: "",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "cy",
domain: "news",
license: "CC-BY-4.0",
citation: "Roberts et al. (2021)",
year: 2021,
format: "CoNLL",
notes: "Celtic language; ~900k speakers; supports Welsh-specific entity types. Prior URL is dead; needs a fresh mirror.",
access_status: Deprecated,
categories: [ner, indigenous, low_resource],
},
BasqueNER {
name: "Basque NER",
description: "Named entity recognition for Basque (Euskara). Language isolate NER corpus.",
url: "",
entity_types: ["PER", "LOC", "ORG", "MISC"],
language: "eu",
domain: "news",
license: "CC-BY-4.0",
citation: "Alegria et al. (2019)",
year: 2019,
format: "CoNLL",
size_hint: "~80k tokens",
notes: "Language isolate; agglutinative morphology; ~750k speakers; ergative-absolutive alignment. Prior URL is dead; needs a fresh mirror.",
access_status: Deprecated,
categories: [ner, indigenous, low_resource],
},
HIPE2022 {
name: "HIPE-2022",
description: "Multilingual Historical NER. 6 datasets across 5 languages (de, en, fr, fi, sv): historical newspapers and classical commentaries.",
url: "https://raw.githubusercontent.com/hipe-eval/HIPE-2022-data/147f5bc3c7fb7e5c6b024a9ffd6503cd019fb9ea/data/v2.1/hipe2020/de/HIPE-2022-v2.1-hipe2020-test-de.tsv",
entity_types: ["PER", "LOC", "ORG", "PROD"],
language: "mul",
domain: "historical",
license: "CC-BY-NC-4.0",
citation: "Ehrmann et al. (2022)",
paper_url: "https://ceur-ws.org/Vol-3180/paper-83.pdf",
year: 2022,
format: "TSV",
annotation_scheme: "IOB2",
notes: "CLEF-HIPE shared task; includes commentaries on Greek/Latin classics (written in de/en/fr); OCR noise",
splits: ["test"],
tasks: ["ner"],
categories: [ner, historical, multilingual],
},
HistNERo {
name: "HistNERo",
description: "Romanian historical newspaper NER. First Romanian historical NER corpus from four regions.",
url: "https://github.com/avramandrei/histnero",
entity_types: ["PER", "LOC", "ORG", "DATE", "MISC"],
language: "ro",
domain: "historical",
license: "CC-BY-4.0",
citation: "Avram et al. (2024)",
paper_url: "https://arxiv.org/abs/2405.00155",
year: 2024,
format: "CoNLL",
size_hint: "~323k tokens, 19th-20th century newspapers",
notes: "Four historical Romanian regions (Bessarabia, Moldavia, Transylvania, Wallachia); diachronic benchmark",
splits: ["train", "dev", "test"],
tasks: ["ner"],
categories: [ner, historical, low_resource],
},
QuaeroOldPress {
name: "Quaero Old Press",
description: "French historical newspaper NER from 1890. OCR-corrected with manual NE annotations.",
url: "",
entity_types: ["PER", "LOC", "ORG", "TIME", "PROD"],
language: "fr",
domain: "historical",
license: "Research",
citation: "Galibert et al. (2012)",
year: 2012,
format: "XML",
size_hint: "295 pages, 1890 newspapers",
notes: "French historical NER benchmark; manual OCR corrections; reasonably clean historical text",
splits: ["test"],
tasks: ["ner"],
access_status: ContactAuthors,
categories: [ner, historical],
},
HistoricalChineseNER {
name: "Historical Chinese NER",
description: "Multi-task historical Chinese corpus. NER + entity linking + coreference + relations.",
url: "",
entity_types: ["PER", "LOC", "ORG", "TIME", "OFFICIAL"],
language: "zh",
domain: "historical",
license: "Research",
citation: "LREC-COLING (2024)",
paper_url: "https://aclanthology.org/2024.lrec-main.35.pdf",
year: 2024,
format: "JSONL",
size_hint: "Historical Chinese newspapers + documents",
notes: "LREC-COLING 2024; multi-task historical IE benchmark; cross-genre historical Chinese",
splits: ["train", "dev", "test"],
tasks: ["ner", "el", "coref", "re"],
access_status: ContactAuthors,
categories: [ner, entity_linking, coref, historical, multilingual],
},
CHisIEC {
name: "CHisIEC",
description: "Chinese Historical Information Extraction Corpus. Ancient Chinese NER + RE with 12 relation types.",
url: "https://raw.githubusercontent.com/tangxuemei1995/CHisIEC/main/data/re/coling_test.json",
entity_types: ["PER", "LOC", "OFI", "BOOK"],
language: "lzh",
domain: "historical",
license: "Research",
citation: "Tang et al. (2024)",
paper_url: "https://aclanthology.org/2024.lrec-main.283/",
year: 2024,
format: "JSON",
size_hint: "3,891 paragraphs, 13,520 entities, 8,228 relations",
notes: "Ancient Chinese dynastic histories (24史); 12 domain-specific relations for historical socio-political structures; pre-modern Chinese (文言文)",
splits: ["train", "dev", "test"],
tasks: ["ner", "re"],
categories: [ner, relation_extraction, historical, ancient],
},