use bitflags::bitflags;
bitflags! {
/// Bit set of task, domain, and language tags that can be attached to a dataset.
///
/// Every constant occupies its own bit of the underlying `u32` (bits 0 through 26),
/// so any combination of tags can be stored in a single value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct DatasetFlags: u32 {
/// Bit 0: NER data. This is also the value returned by `DatasetFlags::default()`.
const NER = 1 << 0;
/// Bit 1: coreference data.
const COREFERENCE = 1 << 1;
/// Bit 2: intra-document coreference.
const INTRA_DOC_COREF = 1 << 2;
/// Bit 3: inter-document coreference.
const INTER_DOC_COREF = 1 << 3;
/// Bit 4: temporal NER.
const TEMPORAL_NER = 1 << 4;
/// Bit 5: biomedical data.
const BIOMEDICAL = 1 << 5;
/// Bit 6: social-media data.
const SOCIAL_MEDIA = 1 << 6;
/// Bit 7: data from a specialized domain.
const SPECIALIZED_DOMAIN = 1 << 7;
/// Bit 8: relation extraction.
const RELATION_EXTRACTION = 1 << 8;
/// Bit 9: historical data.
const HISTORICAL = 1 << 9;
/// Bit 10: bias evaluation.
const BIAS_EVALUATION = 1 << 10;
/// Bit 11: dialogue coreference.
const DIALOGUE_COREF = 1 << 11;
/// Bit 12: joint NER and RE (presumably relation extraction; confirm).
const JOINT_NER_RE = 1 << 12;
/// Bit 13: discontinuous NER.
const DISCONTINUOUS_NER = 1 << 13;
/// Bit 14: few-shot data.
const FEW_SHOT = 1 << 14;
/// Bit 15: multilingual data.
const MULTILINGUAL = 1 << 15;
/// Bit 16: constructed-language data.
const CONSTRUCTED_LANGUAGE = 1 << 16;
/// Bit 17: code-switching data.
const CODE_SWITCHING = 1 << 17;
/// Bit 18: African-language data.
const AFRICAN_LANGUAGE = 1 << 18;
/// Bit 19: entity linking.
const ENTITY_LINKING = 1 << 19;
/// Bit 20: event extraction.
const EVENT_EXTRACTION = 1 << 20;
/// Bit 21: legal data.
const LEGAL = 1 << 21;
/// Bit 22: financial data.
const FINANCIAL = 1 << 22;
/// Bit 23: scientific data.
const SCIENTIFIC = 1 << 23;
/// Bit 24: literary data.
const LITERARY = 1 << 24;
/// Bit 25: news data.
const NEWS = 1 << 25;
/// Bit 26: low-resource data.
const LOW_RESOURCE = 1 << 26;
}
}
impl Default for DatasetFlags {
fn default() -> Self {
Self::NER
}
}
/// Static description of a single dataset.
///
/// Every field is either `'static` or plain data, so the struct is `Copy` and can be
/// built in `const`/`static` context through [`DatasetMetadata::new`] and its
/// chained setter methods.
#[derive(Debug, Clone, Copy)]
pub struct DatasetMetadata {
/// Dataset name, e.g. `"WikiGold"`.
pub name: &'static str,
/// Short human-readable description.
pub description: &'static str,
/// URL string for obtaining the dataset.
pub download_url: &'static str,
/// Free-form domain label; `"general"` unless overridden.
pub domain: &'static str,
/// Language label; `"en"` unless overridden. This file also uses `"multilingual"`.
pub language: &'static str,
/// Entity type label strings; empty unless overridden.
pub entity_types: &'static [&'static str],
/// Tags attached to the dataset; `DatasetFlags::NER` unless overridden.
pub flags: DatasetFlags,
/// Optional citation text; `None` unless set.
pub citation: Option<&'static str>,
/// Optional license text; `None` unless set.
pub license: Option<&'static str>,
/// Optional year; `None` unless set (presumably the publication/release year; confirm).
pub year: Option<u16>,
/// Optional URL string of an accompanying paper; `None` unless set.
pub paper_url: Option<&'static str>,
}
impl DatasetMetadata {
    /// Creates metadata from the three required fields, using defaults for the rest.
    ///
    /// Defaults: `domain` is `"general"`, `language` is `"en"`, `entity_types` is
    /// empty, `flags` is [`DatasetFlags::NER`], and `citation`, `license`, `year`
    /// and `paper_url` are `None`.
    ///
    /// This and every setter below is a `const fn`, so a full descriptor can be
    /// assembled in a `const` or `static` initializer.
    #[must_use]
    pub const fn new(
        name: &'static str,
        description: &'static str,
        download_url: &'static str,
    ) -> Self {
        Self {
            name,
            description,
            download_url,
            domain: "general",
            language: "en",
            entity_types: &[],
            flags: DatasetFlags::NER,
            citation: None,
            license: None,
            year: None,
            paper_url: None,
        }
    }

    /// Returns `self` with `domain` replaced.
    #[must_use]
    pub const fn domain(mut self, domain: &'static str) -> Self {
        self.domain = domain;
        self
    }

    /// Returns `self` with `language` replaced.
    #[must_use]
    pub const fn language(mut self, language: &'static str) -> Self {
        self.language = language;
        self
    }

    /// Returns `self` with `entity_types` replaced by `types`.
    #[must_use]
    pub const fn entity_types(mut self, types: &'static [&'static str]) -> Self {
        self.entity_types = types;
        self
    }

    /// Returns `self` with `flags` replaced.
    ///
    /// The previous flags are overwritten, not merged: the default
    /// [`DatasetFlags::NER`] is dropped unless `flags` includes it.
    #[must_use]
    pub const fn flags(mut self, flags: DatasetFlags) -> Self {
        self.flags = flags;
        self
    }

    /// Returns `self` with `citation` set to `Some(citation)`.
    #[must_use]
    pub const fn citation(mut self, citation: &'static str) -> Self {
        self.citation = Some(citation);
        self
    }

    /// Returns `self` with `license` set to `Some(license)`.
    #[must_use]
    pub const fn license(mut self, license: &'static str) -> Self {
        self.license = Some(license);
        self
    }

    /// Returns `self` with `year` set to `Some(year)`.
    #[must_use]
    pub const fn year(mut self, year: u16) -> Self {
        self.year = Some(year);
        self
    }

    /// Returns `self` with `paper_url` set to `Some(url)`.
    #[must_use]
    pub const fn paper_url(mut self, url: &'static str) -> Self {
        self.paper_url = Some(url);
        self
    }

    // One predicate per `DatasetFlags` constant, in declaration (bit) order.

    /// Returns `true` if `flags` contains [`DatasetFlags::NER`].
    #[inline]
    #[must_use]
    pub const fn is_ner(&self) -> bool {
        self.flags.contains(DatasetFlags::NER)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::COREFERENCE`].
    #[inline]
    #[must_use]
    pub const fn is_coreference(&self) -> bool {
        self.flags.contains(DatasetFlags::COREFERENCE)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::INTRA_DOC_COREF`].
    #[inline]
    #[must_use]
    pub const fn is_intra_doc_coref(&self) -> bool {
        self.flags.contains(DatasetFlags::INTRA_DOC_COREF)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::INTER_DOC_COREF`].
    #[inline]
    #[must_use]
    pub const fn is_inter_doc_coref(&self) -> bool {
        self.flags.contains(DatasetFlags::INTER_DOC_COREF)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::TEMPORAL_NER`].
    #[inline]
    #[must_use]
    pub const fn is_temporal_ner(&self) -> bool {
        self.flags.contains(DatasetFlags::TEMPORAL_NER)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::BIOMEDICAL`].
    #[inline]
    #[must_use]
    pub const fn is_biomedical(&self) -> bool {
        self.flags.contains(DatasetFlags::BIOMEDICAL)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::SOCIAL_MEDIA`].
    #[inline]
    #[must_use]
    pub const fn is_social_media(&self) -> bool {
        self.flags.contains(DatasetFlags::SOCIAL_MEDIA)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::SPECIALIZED_DOMAIN`].
    #[inline]
    #[must_use]
    pub const fn is_specialized_domain(&self) -> bool {
        self.flags.contains(DatasetFlags::SPECIALIZED_DOMAIN)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::RELATION_EXTRACTION`].
    #[inline]
    #[must_use]
    pub const fn is_relation_extraction(&self) -> bool {
        self.flags.contains(DatasetFlags::RELATION_EXTRACTION)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::HISTORICAL`].
    #[inline]
    #[must_use]
    pub const fn is_historical(&self) -> bool {
        self.flags.contains(DatasetFlags::HISTORICAL)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::BIAS_EVALUATION`].
    #[inline]
    #[must_use]
    pub const fn is_bias_evaluation(&self) -> bool {
        self.flags.contains(DatasetFlags::BIAS_EVALUATION)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::DIALOGUE_COREF`].
    #[inline]
    #[must_use]
    pub const fn is_dialogue_coref(&self) -> bool {
        self.flags.contains(DatasetFlags::DIALOGUE_COREF)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::JOINT_NER_RE`].
    #[inline]
    #[must_use]
    pub const fn is_joint_ner_re(&self) -> bool {
        self.flags.contains(DatasetFlags::JOINT_NER_RE)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::DISCONTINUOUS_NER`].
    #[inline]
    #[must_use]
    pub const fn is_discontinuous_ner(&self) -> bool {
        self.flags.contains(DatasetFlags::DISCONTINUOUS_NER)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::FEW_SHOT`].
    #[inline]
    #[must_use]
    pub const fn is_few_shot(&self) -> bool {
        self.flags.contains(DatasetFlags::FEW_SHOT)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::MULTILINGUAL`].
    #[inline]
    #[must_use]
    pub const fn is_multilingual(&self) -> bool {
        self.flags.contains(DatasetFlags::MULTILINGUAL)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::CONSTRUCTED_LANGUAGE`].
    #[inline]
    #[must_use]
    pub const fn is_constructed_language(&self) -> bool {
        self.flags.contains(DatasetFlags::CONSTRUCTED_LANGUAGE)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::CODE_SWITCHING`].
    #[inline]
    #[must_use]
    pub const fn is_code_switching(&self) -> bool {
        self.flags.contains(DatasetFlags::CODE_SWITCHING)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::AFRICAN_LANGUAGE`].
    #[inline]
    #[must_use]
    pub const fn is_african_language(&self) -> bool {
        self.flags.contains(DatasetFlags::AFRICAN_LANGUAGE)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::ENTITY_LINKING`].
    #[inline]
    #[must_use]
    pub const fn is_entity_linking(&self) -> bool {
        self.flags.contains(DatasetFlags::ENTITY_LINKING)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::EVENT_EXTRACTION`].
    #[inline]
    #[must_use]
    pub const fn is_event_extraction(&self) -> bool {
        self.flags.contains(DatasetFlags::EVENT_EXTRACTION)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::LEGAL`].
    #[inline]
    #[must_use]
    pub const fn is_legal(&self) -> bool {
        self.flags.contains(DatasetFlags::LEGAL)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::FINANCIAL`].
    #[inline]
    #[must_use]
    pub const fn is_financial(&self) -> bool {
        self.flags.contains(DatasetFlags::FINANCIAL)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::SCIENTIFIC`].
    #[inline]
    #[must_use]
    pub const fn is_scientific(&self) -> bool {
        self.flags.contains(DatasetFlags::SCIENTIFIC)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::LITERARY`].
    #[inline]
    #[must_use]
    pub const fn is_literary(&self) -> bool {
        self.flags.contains(DatasetFlags::LITERARY)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::NEWS`].
    #[inline]
    #[must_use]
    pub const fn is_news(&self) -> bool {
        self.flags.contains(DatasetFlags::NEWS)
    }

    /// Returns `true` if `flags` contains [`DatasetFlags::LOW_RESOURCE`].
    #[inline]
    #[must_use]
    pub const fn is_low_resource(&self) -> bool {
        self.flags.contains(DatasetFlags::LOW_RESOURCE)
    }
}
/// Four-label entity type set (`PER`, `LOC`, `ORG`, `MISC`).
///
/// Used in this file as the entity types of `WIKIGOLD` and `MASAKHANER`.
pub static CONLL_TYPES: &[&str] = &["PER", "LOC", "ORG", "MISC"];
/// Eighteen-label entity type set.
///
/// Used in this file as the entity types of `ONTONOTES_COREF`.
pub static ONTONOTES_TYPES: &[&str] = &[
"PERSON",
"NORP",
"FAC",
"ORG",
"GPE",
"LOC",
"PRODUCT",
"EVENT",
"WORK_OF_ART",
"LAW",
"LANGUAGE",
"DATE",
"TIME",
"PERCENT",
"MONEY",
"QUANTITY",
"ORDINAL",
"CARDINAL",
];
/// Four-label entity type set (`Chemical`, `Disease`, `Gene`, `Species`).
///
/// Not referenced by any dataset descriptor visible in this file.
pub static BIO_TYPES: &[&str] = &["Chemical", "Disease", "Gene", "Species"];
/// Seven-label entity type set (`PER`, `ORG`, `GPE`, `LOC`, `FAC`, `VEH`, `WEA`).
///
/// Not referenced by any dataset descriptor visible in this file.
pub static ACE_TYPES: &[&str] = &["PER", "ORG", "GPE", "LOC", "FAC", "VEH", "WEA"];
/// Descriptor for the WikiGold dataset: English, `CONLL_TYPES` labels, year 2009,
/// tagged `NER` and `NEWS`.
///
// NOTE(review): the description says "Wikipedia-based" while the domain and flags
// say news. Confirm which is intended before changing either.
pub static WIKIGOLD: DatasetMetadata = DatasetMetadata::new(
    "WikiGold",
    "Wikipedia-based NER (PER, LOC, ORG, MISC)",
    "https://huggingface.co/datasets/wikigold",
)
.language("en")
.domain("news")
.year(2009)
.entity_types(CONLL_TYPES)
.flags(DatasetFlags::NEWS.union(DatasetFlags::NER));
/// Descriptor for the WNUT-17 dataset: English, six lowercase entity labels,
/// year 2017, tagged `NER` and `SOCIAL_MEDIA`.
pub static WNUT17: DatasetMetadata = DatasetMetadata::new(
    "WNUT-17",
    "Social media NER (emerging entities)",
    "https://huggingface.co/datasets/wnut_17",
)
.language("en")
.domain("social-media")
.year(2017)
.entity_types(&[
    "person",
    "location",
    "corporation",
    "product",
    "creative-work",
    "group",
])
.flags(DatasetFlags::SOCIAL_MEDIA.union(DatasetFlags::NER));
/// Descriptor for the BC5CDR dataset: English, two entity labels (`Chemical`,
/// `Disease`), year 2015, tagged `NER`, `BIOMEDICAL` and `SPECIALIZED_DOMAIN`.
pub static BC5CDR: DatasetMetadata = DatasetMetadata::new(
    "BC5CDR",
    "Biomedical NER (chemicals, diseases)",
    "https://huggingface.co/datasets/bc5cdr",
)
.language("en")
.domain("biomedical")
.year(2015)
.entity_types(&["Chemical", "Disease"])
.flags(
    DatasetFlags::BIOMEDICAL
        .union(DatasetFlags::SPECIALIZED_DOMAIN)
        .union(DatasetFlags::NER),
);
/// Descriptor for the GAP dataset: English, labels `Pronoun` and `Name`, year 2018,
/// tagged `COREFERENCE`, `INTRA_DOC_COREF` and `BIAS_EVALUATION`.
///
/// The flag set deliberately omits `NER`, so `is_ner()` is `false` for this entry.
pub static GAP: DatasetMetadata = DatasetMetadata::new(
    "GAP",
    "Gendered Ambiguous Pronouns",
    "https://huggingface.co/datasets/gap",
)
.language("en")
.domain("coreference")
.year(2018)
.entity_types(&["Pronoun", "Name"])
.flags(
    DatasetFlags::INTRA_DOC_COREF
        .union(DatasetFlags::BIAS_EVALUATION)
        .union(DatasetFlags::COREFERENCE),
);
/// Descriptor for the OntoNotes 5.0 coreference dataset: English,
/// `ONTONOTES_TYPES` labels, year 2013, tagged `COREFERENCE`, `INTRA_DOC_COREF`
/// and `NER`.
pub static ONTONOTES_COREF: DatasetMetadata = DatasetMetadata::new(
    "OntoNotes 5.0 (Coreference)",
    "Standard coreference benchmark",
    "https://catalog.ldc.upenn.edu/LDC2013T19",
)
.language("en")
.domain("coreference")
.year(2013)
.entity_types(ONTONOTES_TYPES)
.flags(
    DatasetFlags::NER
        .union(DatasetFlags::COREFERENCE)
        .union(DatasetFlags::INTRA_DOC_COREF),
);
/// Descriptor for the ECB+ dataset: English, labels `Event` and `Entity`,
/// year 2014, tagged `COREFERENCE`, `INTER_DOC_COREF` and `EVENT_EXTRACTION`.
///
/// `INTRA_DOC_COREF` is not set for this entry.
pub static ECBPLUS: DatasetMetadata = DatasetMetadata::new(
    "ECB+",
    "Event Coreference Bank Plus",
    "http://www.newsreader-project.eu/results/data/the-ecb-corpus/",
)
.language("en")
.domain("coreference")
.year(2014)
.entity_types(&["Event", "Entity"])
.flags(
    DatasetFlags::INTER_DOC_COREF
        .union(DatasetFlags::EVENT_EXTRACTION)
        .union(DatasetFlags::COREFERENCE),
);
/// Descriptor for the MultiNERD dataset: language label `"multilingual"`,
/// fifteen entity labels, year 2022, tagged `NER` and `MULTILINGUAL`.
pub static MULTINERD: DatasetMetadata = DatasetMetadata::new(
    "MultiNERD",
    "Multilingual NER (10 languages)",
    "https://huggingface.co/datasets/Babelscape/multinerd",
)
.language("multilingual")
.domain("multilingual")
.year(2022)
.entity_types(&[
    "PER", "LOC", "ORG", "ANIM", "BIO", "CEL", "DIS", "EVE", "FOOD", "INST", "MEDIA", "MYTH",
    "PLANT", "TIME", "VEHI",
])
.flags(DatasetFlags::MULTILINGUAL.union(DatasetFlags::NER));
/// Descriptor for the FewNERD dataset: English, eight entity labels, year 2021,
/// tagged `NER` and `FEW_SHOT`.
pub static FEWNERD: DatasetMetadata = DatasetMetadata::new(
    "FewNERD",
    "Few-shot NER with fine-grained types",
    "https://huggingface.co/datasets/DFKI-SLT/few-nerd",
)
.language("en")
.domain("general")
.year(2021)
.entity_types(&[
    "person",
    "location",
    "organization",
    "building",
    "art",
    "product",
    "event",
    "other",
])
.flags(DatasetFlags::FEW_SHOT.union(DatasetFlags::NER));
/// Descriptor for the MasakhaNER dataset: language label `"multilingual"`,
/// `CONLL_TYPES` labels, year 2021, tagged `NER`, `MULTILINGUAL`,
/// `AFRICAN_LANGUAGE` and `LOW_RESOURCE`.
pub static MASAKHANER: DatasetMetadata = DatasetMetadata::new(
    "MasakhaNER",
    "NER for African languages",
    "https://huggingface.co/datasets/masakhaner",
)
.language("multilingual")
.domain("low-resource")
.year(2021)
.entity_types(CONLL_TYPES)
.flags(
    DatasetFlags::AFRICAN_LANGUAGE
        .union(DatasetFlags::LOW_RESOURCE)
        .union(DatasetFlags::MULTILINGUAL)
        .union(DatasetFlags::NER),
);
/// Descriptor for the GENIA dataset: English, five entity labels, year 2003,
/// tagged `NER`, `BIOMEDICAL` and `SPECIALIZED_DOMAIN`.
pub static GENIA: DatasetMetadata = DatasetMetadata::new(
    "GENIA",
    "Biomedical NER (genes, proteins)",
    "http://www.geniaproject.org/",
)
.language("en")
.domain("biomedical")
.year(2003)
.entity_types(&["DNA", "RNA", "protein", "cell_line", "cell_type"])
.flags(
    DatasetFlags::BIOMEDICAL
        .union(DatasetFlags::SPECIALIZED_DOMAIN)
        .union(DatasetFlags::NER),
);
#[cfg(test)]
mod tests {
    use super::*;

    /// OR-ing flags together sets the requested bits and no others.
    #[test]
    fn test_flags_operations() {
        let combined =
            DatasetFlags::NER | DatasetFlags::BIOMEDICAL | DatasetFlags::SPECIALIZED_DOMAIN;

        assert!(combined.contains(DatasetFlags::NER));
        assert!(combined.contains(DatasetFlags::BIOMEDICAL));
        assert!(!combined.contains(DatasetFlags::SOCIAL_MEDIA));
    }

    /// The chained setters store exactly what they are given.
    #[test]
    fn test_metadata_builder() {
        let wanted_flags = DatasetFlags::NER | DatasetFlags::BIOMEDICAL;
        let dataset = DatasetMetadata::new("Test", "A test dataset", "https://example.com")
            .language("en")
            .domain("biomedical")
            .year(2023)
            .entity_types(&["Disease", "Drug"])
            .flags(wanted_flags);

        assert_eq!(dataset.name, "Test");
        assert_eq!(dataset.domain, "biomedical");
        assert!(dataset.is_biomedical());
        assert!(!dataset.is_social_media());
        assert_eq!(dataset.year, Some(2023));
    }

    /// The builder is usable in a `const` initializer.
    #[test]
    fn test_const_construction() {
        const META: DatasetMetadata = DatasetMetadata::new("Const", "Desc", "url")
            .language("en")
            .domain("test");

        assert_eq!(META.name, "Const");
        assert_eq!(META.domain, "test");
    }

    /// `WIKIGOLD` is an NER entry in the "news" domain and is not coreference.
    #[test]
    fn test_static_wikigold() {
        let entry = &WIKIGOLD;

        assert_eq!(entry.name, "WikiGold");
        assert_eq!(entry.domain, "news");
        assert!(entry.is_ner());
        assert!(!entry.is_coreference());
    }

    /// `BC5CDR` is biomedical, specialized, and has two entity labels.
    #[test]
    fn test_static_bc5cdr() {
        let entry = &BC5CDR;

        assert!(entry.is_biomedical());
        assert!(entry.is_specialized_domain());
        assert_eq!(entry.entity_types.len(), 2);
    }

    /// `GAP` is a coreference / bias-evaluation entry without the NER tag.
    #[test]
    fn test_static_gap() {
        let entry = &GAP;

        assert!(entry.is_coreference());
        assert!(entry.is_intra_doc_coref());
        assert!(entry.is_bias_evaluation());
        assert!(!entry.is_ner());
    }

    /// `ECBPLUS` is inter-document, not intra-document, coreference.
    #[test]
    fn test_static_ecbplus() {
        let entry = &ECBPLUS;

        assert!(entry.is_inter_doc_coref());
        assert!(!entry.is_intra_doc_coref());
    }

    /// `MASAKHANER` carries both the African-language and multilingual tags.
    #[test]
    fn test_static_masakhaner() {
        let entry = &MASAKHANER;

        assert!(entry.is_african_language());
        assert!(entry.is_multilingual());
    }
}