pub use super::dataset::{AnnotatedExample, Difficulty, Domain};
pub use super::dataset::synthetic::{
academic_dataset,
adversarial_dataset,
aerospace_dataset,
all_datasets,
automotive_dataset,
biomedical_dataset,
conversational_dataset,
cybersecurity_dataset,
ecommerce_dataset,
energy_dataset,
entertainment_dataset,
financial_dataset,
food_dataset,
globally_diverse_dataset,
hard_domain_examples,
healthcare_dataset,
historical_dataset,
legal_dataset,
manufacturing_dataset,
multilingual_dataset,
news_dataset,
politics_dataset,
real_estate_dataset,
scientific_dataset,
social_media_dataset,
sports_dataset,
structured_dataset,
technology_dataset,
travel_dataset,
weather_dataset,
};
use std::collections::HashMap;
/// Returns the news examples under a CoNLL-style name.
///
/// This is a pure alias: it calls [`news_dataset`] and returns its result
/// unchanged, so both functions always yield the same examples.
// NOTE(review): the name suggests the news data is meant to resemble the CoNLL
// benchmarks; nothing in this file enforces that — confirm against the
// synthetic dataset definitions.
#[inline]
pub fn conll_style_dataset() -> Vec<AnnotatedExample> {
news_dataset()
}
/// Returns the synthetic examples selected for `domain`.
///
/// Thin wrapper that forwards `domain` to `super::dataset::synthetic::by_domain`
/// and returns its result as-is; the selection rules live in that function.
pub fn datasets_by_domain(domain: Domain) -> Vec<AnnotatedExample> {
super::dataset::synthetic::by_domain(domain)
}
/// Returns the synthetic examples selected for `difficulty`.
///
/// Thin wrapper that forwards `difficulty` to
/// `super::dataset::synthetic::by_difficulty` and returns its result as-is; the
/// selection rules live in that function. The result may be empty.
pub fn datasets_by_difficulty(difficulty: Difficulty) -> Vec<AnnotatedExample> {
super::dataset::synthetic::by_difficulty(difficulty)
}
/// Summary counts describing the synthetic datasets exposed by this module.
///
/// Plain data holder: every field is public and there are no invariants tying
/// the fields together. `Default` yields all-zero counts with empty maps, and
/// `PartialEq`/`Eq` compare field by field, which makes the type usable with
/// `assert_eq!` in tests.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DatasetStats {
    /// Total number of examples counted.
    pub total_examples: usize,
    /// Total number of entities counted across those examples.
    pub total_entities: usize,
    /// Example count keyed by a domain label.
    // NOTE(review): the exact key format (e.g. `Debug` vs. display name of
    // `Domain`) is decided by the producer of these stats — confirm there.
    pub examples_per_domain: HashMap<String, usize>,
    /// Example count keyed by a difficulty label (same caveat on key format).
    pub examples_per_difficulty: HashMap<String, usize>,
}
/// Builds a [`DatasetStats`] snapshot from the synthetic dataset statistics.
///
/// Calls `super::dataset::synthetic::stats()` once and copies its fields over:
/// `total_examples` and `total_entities` keep their names, while `domains`
/// becomes `examples_per_domain` and `difficulties` becomes
/// `examples_per_difficulty`.
pub fn dataset_stats() -> DatasetStats {
    let synthetic = super::dataset::synthetic::stats();

    // The two maps are moved out of the source value, not cloned.
    let examples_per_domain = synthetic.domains;
    let examples_per_difficulty = synthetic.difficulties;

    DatasetStats {
        total_examples: synthetic.total_examples,
        total_entities: synthetic.total_entities,
        examples_per_domain,
        examples_per_difficulty,
    }
}
/// Returns one combined list made of three synthetic example sets.
///
/// The result is the concatenation, in this order, of
/// [`hard_domain_examples`], [`globally_diverse_dataset`] and
/// [`adversarial_dataset`]. No deduplication or reordering is performed.
pub fn extended_quality_dataset() -> Vec<AnnotatedExample> {
    // Each source is built eagerly, in the order listed, while the chain is
    // being assembled; `collect` then drains them front to back.
    hard_domain_examples()
        .into_iter()
        .chain(globally_diverse_dataset())
        .chain(adversarial_dataset())
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The combined corpus must be non-empty and hold at least 100 examples.
    #[test]
    fn test_all_datasets() {
        let corpus = all_datasets();
        assert!(!corpus.is_empty());
        assert!(corpus.len() >= 100, "Expected at least 100 examples");
    }

    /// The CoNLL-style alias must return as many examples as the news dataset.
    #[test]
    fn test_conll_alias() {
        let alias_len = conll_style_dataset().len();
        let news_len = news_dataset().len();
        assert_eq!(alias_len, news_len);
    }

    /// Selecting by domain yields a non-empty set tagged with that domain only.
    #[test]
    fn test_datasets_by_domain() {
        let selected = datasets_by_domain(Domain::News);
        assert!(!selected.is_empty());
        selected
            .iter()
            .for_each(|example| assert_eq!(example.domain, Domain::News));
    }

    /// Selecting by difficulty yields only examples of that difficulty
    /// (an empty result is accepted).
    #[test]
    fn test_datasets_by_difficulty() {
        datasets_by_difficulty(Difficulty::Hard)
            .iter()
            .for_each(|example| assert_eq!(example.difficulty, Difficulty::Hard));
    }

    /// The stats snapshot must report some examples, entities and domains.
    #[test]
    fn test_dataset_stats() {
        let snapshot = dataset_stats();
        assert!(snapshot.total_examples > 0);
        assert!(snapshot.total_entities > 0);
        assert!(!snapshot.examples_per_domain.is_empty());
    }

    /// The extended quality set must contain at least one example.
    #[test]
    fn test_extended_quality_dataset() {
        let combined = extended_quality_dataset();
        assert!(!combined.is_empty());
    }
}