Skip to main content

DatasetSpec

Trait DatasetSpec 

Source
pub trait DatasetSpec: Send + Sync {
    // 27 methods

    // Required methods
    fn name(&self) -> &str;
    fn id(&self) -> &str;
    fn task(&self) -> Task;
    fn languages(&self) -> &[&str];
    fn entity_types(&self) -> &[&str];
    fn parser_hint(&self) -> ParserHint;
    fn license(&self) -> License;

    // Provided methods
    fn description(&self) -> Option<&str> { ... }
    fn domain(&self) -> Domain { ... }
    fn download_url(&self) -> Option<&str> { ... }
    fn citation(&self) -> Option<&str> { ... }
    fn doi(&self) -> Option<&str> { ... }
    fn local_path(&self) -> Option<&Path> { ... }
    fn stats(&self) -> DatasetStats { ... }
    fn temporal_coverage(&self) -> TemporalCoverage { ... }
    fn secondary_tasks(&self) -> &[Task] { ... }
    fn is_constructed_language(&self) -> bool { ... }
    fn is_historical(&self) -> bool { ... }
    fn requires_auth(&self) -> bool { ... }
    fn version(&self) -> Option<&str> { ... }
    fn notes(&self) -> Option<&str> { ... }
    fn languages_vec(&self) -> Vec<String> { ... }
    fn entity_types_vec(&self) -> Vec<String> { ... }
    fn is_public(&self) -> bool { ... }
    fn supports_task(&self, task: Task) -> bool { ... }
    fn supports_language(&self, lang: &str) -> bool { ... }
    fn has_entity_type(&self, entity_type: &str) -> bool { ... }
}
Expand description

Specification for a dataset that can be loaded and evaluated.

This trait is the foundation for both built-in datasets (via the DatasetId enum) and custom user-defined datasets.

§Implementing Custom Datasets

use anno_core::core::dataset::*;

/// Example custom dataset backed by files at a local path.
struct MyDataset {
    /// Location of the dataset on disk; handed out by `local_path`.
    // Fully qualified so the example does not depend on `PathBuf` being
    // brought into scope by the glob import above.
    path: std::path::PathBuf,
}

impl DatasetSpec for MyDataset {
    /// Human-readable name of the dataset.
    fn name(&self) -> &str {
        "My Custom Dataset"
    }

    /// Unique identifier string (snake_case, no spaces).
    fn id(&self) -> &str {
        "my_custom_v1"
    }

    /// Primary task this dataset is designed for.
    fn task(&self) -> Task {
        Task::NER
    }

    /// Language codes covered by this dataset.
    fn languages(&self) -> &[&str] {
        &["en"]
    }

    /// Entity types annotated in this dataset.
    fn entity_types(&self) -> &[&str] {
        &["PER", "ORG", "LOC"]
    }

    /// Parser format hint used when loading the data.
    fn parser_hint(&self) -> ParserHint {
        ParserHint::CoNLL
    }

    /// License governing dataset usage.
    fn license(&self) -> License {
        License::Proprietary
    }

    /// Overrides the provided method so the stored path is always reported.
    fn local_path(&self) -> Option<&std::path::Path> {
        Some(self.path.as_path())
    }
}

Required Methods§

Source

fn name(&self) -> &str

Human-readable name of the dataset.

Source

fn id(&self) -> &str

Unique identifier string (snake_case, no spaces).

Source

fn task(&self) -> Task

Primary task this dataset is designed for.

Source

fn languages(&self) -> &[&str]

ISO 639-1 language codes (e.g., “en”, “zh”, “de”).

Use ["multilingual"] for datasets covering many languages.

Source

fn entity_types(&self) -> &[&str]

Entity types annotated in this dataset.

For NER: ["PER", "LOC", "ORG", "MISC"]

For biomedical: ["GENE", "DISEASE", "DRUG", "SPECIES"]

Source

fn parser_hint(&self) -> ParserHint

Parser format hint for loading.

Source

fn license(&self) -> License

License governing dataset usage.

Provided Methods§

Source

fn description(&self) -> Option<&str>

Detailed description of the dataset.

Source

fn domain(&self) -> Domain

Domain/genre of source text.

Source

fn download_url(&self) -> Option<&str>

URL for downloading the dataset.

Source

fn citation(&self) -> Option<&str>

Citation information (BibTeX or plain text).

Source

fn doi(&self) -> Option<&str>

DOI or other persistent identifier.

Source

fn local_path(&self) -> Option<&Path>

Local path if already downloaded.

Source

fn stats(&self) -> DatasetStats

Dataset statistics (counts, splits).

Source

fn temporal_coverage(&self) -> TemporalCoverage

Temporal coverage information.

Source

fn secondary_tasks(&self) -> &[Task]

Additional tasks supported beyond the primary task.

Source

fn is_constructed_language(&self) -> bool

Whether this is a constructed/artificial language dataset.

Source

fn is_historical(&self) -> bool

Whether this is a historical/ancient language dataset.

Source

fn requires_auth(&self) -> bool

Whether this dataset requires special access (gated, auth, etc.).

Source

fn version(&self) -> Option<&str>

Version string (e.g., “1.0”, “2024-01”).

Source

fn notes(&self) -> Option<&str>

Notes or caveats about the dataset.

Source

fn languages_vec(&self) -> Vec<String>

Get languages as owned Vec (for custom datasets that don’t have static data).

Default implementation converts from languages().

Source

fn entity_types_vec(&self) -> Vec<String>

Get entity types as owned Vec (for custom datasets that don’t have static data).

Default implementation converts from entity_types().

Source

fn is_public(&self) -> bool

Check if this dataset is publicly available.

Source

fn supports_task(&self, task: Task) -> bool

Check if this dataset supports a specific task.

Source

fn supports_language(&self, lang: &str) -> bool

Check if this dataset covers a specific language.

Source

fn has_entity_type(&self, entity_type: &str) -> bool

Check if this dataset has a specific entity type.

Implementors§