pub trait DatasetSpec: Send + Sync {
Show 27 methods
// Required methods
fn name(&self) -> &str;
fn id(&self) -> &str;
fn task(&self) -> Task;
fn languages(&self) -> &[&str];
fn entity_types(&self) -> &[&str];
fn parser_hint(&self) -> ParserHint;
fn license(&self) -> License;
// Provided methods
fn description(&self) -> Option<&str> { ... }
fn domain(&self) -> Domain { ... }
fn download_url(&self) -> Option<&str> { ... }
fn citation(&self) -> Option<&str> { ... }
fn doi(&self) -> Option<&str> { ... }
fn local_path(&self) -> Option<&Path> { ... }
fn stats(&self) -> DatasetStats { ... }
fn temporal_coverage(&self) -> TemporalCoverage { ... }
fn secondary_tasks(&self) -> &[Task] { ... }
fn is_constructed_language(&self) -> bool { ... }
fn is_historical(&self) -> bool { ... }
fn requires_auth(&self) -> bool { ... }
fn version(&self) -> Option<&str> { ... }
fn notes(&self) -> Option<&str> { ... }
fn languages_vec(&self) -> Vec<String> { ... }
fn entity_types_vec(&self) -> Vec<String> { ... }
fn is_public(&self) -> bool { ... }
fn supports_task(&self, task: Task) -> bool { ... }
fn supports_language(&self, lang: &str) -> bool { ... }
fn has_entity_type(&self, entity_type: &str) -> bool { ... }
}
Specification for a dataset that can be loaded and evaluated.
This trait is the foundation for both built-in datasets (via the
DatasetId enum) and custom user-defined datasets.
§Implementing Custom Datasets
use anno_core::core::dataset::*;
use std::path::PathBuf;
struct MyDataset {
path: PathBuf,
}
impl DatasetSpec for MyDataset {
fn name(&self) -> &str { "My Custom Dataset" }
fn id(&self) -> &str { "my_custom_v1" }
fn task(&self) -> Task { Task::NER }
fn languages(&self) -> &[&str] { &["en"] }
fn entity_types(&self) -> &[&str] { &["PER", "ORG", "LOC"] }
fn parser_hint(&self) -> ParserHint { ParserHint::CoNLL }
fn license(&self) -> License { License::Proprietary }
// Override to provide actual data path
fn local_path(&self) -> Option<&std::path::Path> {
Some(&self.path)
}
}
Required Methods§
fn languages(&self) -> &[&str]
ISO 639-1 language codes (e.g., “en”, “zh”, “de”).
Use ["multilingual"] for datasets covering many languages.
fn entity_types(&self) -> &[&str]
Entity types annotated in this dataset.
For NER: ["PER", "LOC", "ORG", "MISC"]
For biomedical: ["GENE", "DISEASE", "DRUG", "SPECIES"]
fn parser_hint(&self) -> ParserHint
Parser format hint for loading.
Provided Methods§
fn description(&self) -> Option<&str>
Detailed description of the dataset.
fn download_url(&self) -> Option<&str>
URL for downloading the dataset.
fn local_path(&self) -> Option<&Path>
Local path if already downloaded.
fn stats(&self) -> DatasetStats
Dataset statistics (counts, splits).
fn temporal_coverage(&self) -> TemporalCoverage
Temporal coverage information.
fn secondary_tasks(&self) -> &[Task]
Additional tasks supported beyond the primary task.
fn is_constructed_language(&self) -> bool
Whether this is a constructed/artificial language dataset.
fn is_historical(&self) -> bool
Whether this is a historical/ancient language dataset.
fn requires_auth(&self) -> bool
Whether this dataset requires special access (gated, auth, etc.).
fn languages_vec(&self) -> Vec<String>
Get languages as owned Vec (for custom datasets that don’t have static data).
Default implementation converts from languages().
fn entity_types_vec(&self) -> Vec<String>
Get entity types as owned Vec (for custom datasets that don’t have static data).
Default implementation converts from entity_types().
fn supports_task(&self, task: Task) -> bool
Check if this dataset supports a specific task.
fn supports_language(&self, lang: &str) -> bool
Check if this dataset covers a specific language.
fn has_entity_type(&self, entity_type: &str) -> bool
Check if this dataset has a specific entity type.