// entrenar/hf_pipeline/dataset/options.rs

//! Dataset fetch options

use std::path::PathBuf;

use super::split::Split;

7/// Dataset fetch options
8#[derive(Debug, Clone)]
9pub struct DatasetOptions {
10    /// Dataset split to load
11    pub split: Split,
12    /// Maximum number of examples (None = all)
13    pub max_examples: Option<usize>,
14    /// Stream data instead of loading all at once
15    pub streaming: bool,
16    /// Shuffle data
17    pub shuffle: bool,
18    /// Random seed for shuffling
19    pub seed: Option<u64>,
20    /// Cache directory
21    pub cache_dir: Option<PathBuf>,
22}
23
24impl Default for DatasetOptions {
25    fn default() -> Self {
26        Self {
27            split: Split::Train,
28            max_examples: None,
29            streaming: false,
30            shuffle: true,
31            seed: Some(42),
32            cache_dir: None,
33        }
34    }
35}
36
37impl DatasetOptions {
38    /// Create new options for training split
39    #[must_use]
40    pub fn train() -> Self {
41        Self::default()
42    }
43
44    /// Create new options for validation split
45    #[must_use]
46    pub fn validation() -> Self {
47        Self { split: Split::Validation, shuffle: false, ..Default::default() }
48    }
49
50    /// Create new options for test split
51    #[must_use]
52    pub fn test() -> Self {
53        Self { split: Split::Test, shuffle: false, ..Default::default() }
54    }
55
56    /// Set maximum examples
57    #[must_use]
58    pub fn max_examples(mut self, n: usize) -> Self {
59        self.max_examples = Some(n);
60        self
61    }
62
63    /// Enable streaming
64    #[must_use]
65    pub fn streaming(mut self, enabled: bool) -> Self {
66        self.streaming = enabled;
67        self
68    }
69
70    /// Set shuffle
71    #[must_use]
72    pub fn shuffle(mut self, enabled: bool) -> Self {
73        self.shuffle = enabled;
74        self
75    }
76
77    /// Set random seed
78    #[must_use]
79    pub fn seed(mut self, seed: u64) -> Self {
80        self.seed = Some(seed);
81        self
82    }
83}