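//! Dataset struct and implementation.
//!
//! Defines `Dataset`, an in-memory collection of `Example`s, together with its
//! accessors, seeded shuffling, truncation, and a cloning `Iterator` impl.
//!
//! Source path: `entrenar/hf_pipeline/dataset/dataset_impl.rs`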
2
3use super::example::Example;
4
5/// Dataset abstraction
6pub struct Dataset {
7    /// Dataset name/ID
8    name: String,
9    /// Examples
10    examples: Vec<Example>,
11    /// Current position for iteration
12    position: usize,
13}
14
15impl Dataset {
16    /// Create new dataset from examples
17    #[must_use]
18    pub fn new(name: impl Into<String>, examples: Vec<Example>) -> Self {
19        Self { name: name.into(), examples, position: 0 }
20    }
21
22    /// Create mock dataset for testing
23    #[must_use]
24    pub fn mock(num_examples: usize, seq_len: usize) -> Self {
25        let examples: Vec<Example> = (0..num_examples)
26            .map(|i| {
27                Example::from_tokens((0..seq_len).map(|j| ((i + j) % 30000) as u32).collect())
28                    .with_labels((0..seq_len).map(|j| ((i + j + 1) % 30000) as u32).collect())
29            })
30            .collect();
31
32        Self::new("mock_dataset", examples)
33    }
34
35    /// Get dataset name
36    #[must_use]
37    pub fn name(&self) -> &str {
38        &self.name
39    }
40
41    /// Get number of examples
42    #[must_use]
43    pub fn len(&self) -> usize {
44        self.examples.len()
45    }
46
47    /// Check if empty
48    #[must_use]
49    pub fn is_empty(&self) -> bool {
50        self.examples.is_empty()
51    }
52
53    /// Get example by index
54    #[must_use]
55    pub fn get(&self, index: usize) -> Option<&Example> {
56        self.examples.get(index)
57    }
58
59    /// Get all examples
60    #[must_use]
61    pub fn examples(&self) -> &[Example] {
62        &self.examples
63    }
64
65    /// Reset iteration position
66    pub fn reset(&mut self) {
67        self.position = 0;
68    }
69
70    /// Shuffle examples
71    pub fn shuffle(&mut self, seed: u64) {
72        use rand::prelude::*;
73        let mut rng = StdRng::seed_from_u64(seed);
74        self.examples.shuffle(&mut rng);
75    }
76
77    /// Take a subset of examples
78    #[must_use]
79    pub fn take(mut self, n: usize) -> Self {
80        self.examples.truncate(n);
81        self
82    }
83}
84
85impl Iterator for Dataset {
86    type Item = Example;
87
88    fn next(&mut self) -> Option<Self::Item> {
89        if self.position < self.examples.len() {
90            let example = self.examples[self.position].clone();
91            self.position += 1;
92            Some(example)
93        } else {
94            None
95        }
96    }
97}