BC5CDR {
name: "BC5CDR",
description: "Biomedical NER for diseases and chemicals. Created for the BioCreative V CDR task, a major biomedical NLP benchmark.",
url: "https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/bc5cdr/test.txt",
entity_types: ["Chemical", "Disease"],
language: "en",
domain: "biomedical",
license: "Public",
citation: "Li et al. (2016)",
paper_url: "https://academic.oup.com/database/article/doi/10.1093/database/baw068/2630414",
year: 2016,
format: "BIO",
annotation_scheme: "BIO",
size_hint: "~1500 PubMed abstracts, ~14k mentions",
example: "Aspirin B-Chemical\ninduced O\nhepatotoxicity B-Disease\nwas O\nobserved O\n. O",
tasks: ["ner"],
hf_id: "tner/bc5cdr",
categories: [ner, biomedical],
},
NCBIDisease {
name: "NCBI Disease",
description: "NCBI disease mentions corpus. Foundational resource for disease NER from NIH.",
url: "https://raw.githubusercontent.com/shreyashub/BioFLAIR/master/data/ner/NCBI-disease/test.txt",
entity_types: ["Disease"],
language: "en",
domain: "biomedical",
license: "Public",
citation: "Dogan et al. (2014)",
paper_url: "https://www.sciencedirect.com/science/article/pii/S1532046413001974",
year: 2014,
format: "BIO",
annotation_scheme: "BIO",
size_hint: "~800 PubMed abstracts, ~6k mentions",
example: "The O\npatient O\nwas O\ndiagnosed O\nwith O\ntype B-Disease\n2 I-Disease\ndiabetes I-Disease\n. O",
tasks: ["ner"],
hf_id: "ncbi_disease",
categories: [ner, biomedical],
},
GENIA {
name: "GENIA",
description: "Biomedical NER for molecular biology. First large-scale biomedical NER corpus; historically significant.",
url: "https://huggingface.co/datasets/chufangao/GENIA-NER",
entity_types: ["DNA", "RNA", "protein", "cell_line", "cell_type"],
language: "en",
domain: "biomedical",
license: "GENIA Project License",
citation: "Kim et al. (2003)",
paper_url: "https://academic.oup.com/bioinformatics/article/19/suppl_1/i180/227927",
year: 2003,
format: "XML",
annotation_scheme: "Standoff",
size_hint: "2000 MEDLINE abstracts, ~100k entities",
example: "The B-protein\nNF-kappa I-protein\nB I-protein\nprotein I-protein\nbinds O\nto O\nthe B-DNA\nkappa I-DNA\nB I-DNA\nbinding I-DNA\nsite I-DNA\n. O",
notes: "Nested entities common; requires special handling; pioneered biomedical NER",
hf_id: "chufangao/GENIA-NER",
categories: [ner, biomedical],
},
AnatEM {
name: "AnatEM",
description: "Anatomical entity mention corpus. 1,212 PubMed abstracts with anatomical structures.",
url: "https://huggingface.co/datasets/disi-unibo-nlp/AnatEM",
entity_types: ["Anatomy"],
language: "en",
domain: "biomedical",
license: "CC-BY-4.0",
citation: "Ohta et al. (2012)",
paper_url: "https://aclanthology.org/W12-2402/",
year: 2012,
format: "Standoff",
size_hint: "1,212 abstracts, ~7k entity mentions",
notes: "Fine-grained anatomical mentions; standalone or nested within other entities",
hf_id: "disi-unibo-nlp/AnatEM",
categories: [ner, biomedical],
},
BC2GM {
name: "BC2GM",
description: "BioCreative II Gene Mention recognition. Gold-standard gene/protein name tagging.",
url: "https://huggingface.co/datasets/spyysalo/bc2gm_corpus",
entity_types: ["Gene", "Protein"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Smith et al. (2008)",
paper_url: "https://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-s2-s2",
year: 2008,
format: "IOB2",
size_hint: "20k sentences, ~24k gene mentions",
example: "The B-Gene\np53 I-Gene\nprotein I-Gene\nregulates O\ncell O\ncycle O\narrest O\n. O",
notes: "Classic benchmark for gene/protein NER; BioCreative shared task",
tasks: ["ner"],
hf_id: "spyysalo/bc2gm_corpus",
access_status: Public,
categories: [ner, biomedical],
},
BC4CHEMD {
name: "BC4CHEMD",
description: "BioCreative IV Chemical Entity Mention Detection. Drug and chemical name recognition.",
url: "https://huggingface.co/datasets/chintagunta85/bc4chemd",
entity_types: ["Chemical"],
language: "en",
domain: "biomedical",
license: "Research",
citation: "Krallinger et al. (2015)",
paper_url: "https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-7-S1-S2",
year: 2015,
format: "IOB2",
size_hint: "10k PubMed abstracts, ~84k chemical mentions",
example: "Treatment O\nwith O\naspirin B-Chemical\nreduced O\ninflammation O\n. O",
notes: "Chemical NER benchmark; includes IUPAC names, trivial names, abbreviations",
tasks: ["ner"],
hf_id: "chintagunta85/bc4chemd",
access_status: Public,
categories: [ner, biomedical],
},