entrenar/hf_pipeline/dataset/
options.rs1use std::path::PathBuf;
4
5use super::split::Split;
6
7#[derive(Debug, Clone)]
9pub struct DatasetOptions {
10 pub split: Split,
12 pub max_examples: Option<usize>,
14 pub streaming: bool,
16 pub shuffle: bool,
18 pub seed: Option<u64>,
20 pub cache_dir: Option<PathBuf>,
22}
23
24impl Default for DatasetOptions {
25 fn default() -> Self {
26 Self {
27 split: Split::Train,
28 max_examples: None,
29 streaming: false,
30 shuffle: true,
31 seed: Some(42),
32 cache_dir: None,
33 }
34 }
35}
36
37impl DatasetOptions {
38 #[must_use]
40 pub fn train() -> Self {
41 Self::default()
42 }
43
44 #[must_use]
46 pub fn validation() -> Self {
47 Self { split: Split::Validation, shuffle: false, ..Default::default() }
48 }
49
50 #[must_use]
52 pub fn test() -> Self {
53 Self { split: Split::Test, shuffle: false, ..Default::default() }
54 }
55
56 #[must_use]
58 pub fn max_examples(mut self, n: usize) -> Self {
59 self.max_examples = Some(n);
60 self
61 }
62
63 #[must_use]
65 pub fn streaming(mut self, enabled: bool) -> Self {
66 self.streaming = enabled;
67 self
68 }
69
70 #[must_use]
72 pub fn shuffle(mut self, enabled: bool) -> Self {
73 self.shuffle = enabled;
74 self
75 }
76
77 #[must_use]
79 pub fn seed(mut self, seed: u64) -> Self {
80 self.seed = Some(seed);
81 self
82 }
83}